Lab 04 - Text Analysis

Courtney Stowers

Library

library(tidyverse)            # data wrangling
library(gutenbergr)           # Gutenberg project public domain text 
library(quanteda)             # quantitative analysis of textual data
library("quanteda.textstats") # text stats extension
library(readtext)             # load textual data to corpus
library(stopwords)            # alternative package to source stopwords
library(TeXCheckR)            # provides valid_English_contractions (contractions list used below)
library(cleanNLP)             # provides word_frequency (word-frequency table used below)

# Relative file paths
here::i_am("labs/Lab04_Review_Session.Rmd")
library(here)

Anne of Green Gables

# Look up Project Gutenberg authors whose name contains "Montgomery"
# Gutenberg Project: https://www.gutenberg.org/
# gutenbergr docs:   https://docs.ropensci.org/gutenbergr/index.html
gutenberg_authors %>%
  filter(str_detect(author, "Montgomery")) %>%
  select(gutenberg_author_id, author) %>%
  head(10)
## # A tibble: 10 × 2
##    gutenberg_author_id author                                       
##                  <int> <chr>                                        
##  1                  36 Montgomery, L. M. (Lucy Maud)                
##  2                2113 Skinner, Charles M. (Charles Montgomery)     
##  3                3290 Beck, James M. (James Montgomery)            
##  4                5306 Bird, Robert Montgomery                      
##  5                7342 Montgomery, D. H. (David Henry)              
##  6                8392 Montgomery, Frances Trego                    
##  7                9003 Ward, John Montgomery                        
##  8               25940 Flagg, James Montgomery                      
##  9               33486 Montgomery, H. B. (Helen Barrett)            
## 10               34150 Montgomery, Rutherford G. (Rutherford George)
# List the works available for author id 36 (L. M. Montgomery)
gutenberg_works(gutenberg_author_id == 36) %>%
  select(gutenberg_id, title, author)
## # A tibble: 17 × 3
##    gutenberg_id title                                            author         
##           <int> <chr>                                            <chr>          
##  1           45 Anne of Green Gables                             Montgomery, L.…
##  2           47 Anne of Avonlea                                  Montgomery, L.…
##  3           51 Anne of the Island                               Montgomery, L.…
##  4          316 The Golden Road                                  Montgomery, L.…
##  5          544 Anne's House of Dreams                           Montgomery, L.…
##  6         1354 Chronicles of Avonlea                            Montgomery, L.…
##  7         3796 Rilla of Ingleside                               Montgomery, L.…
##  8         5340 Further Chronicles of Avonlea                    Montgomery, L.…
##  9         5341 Kilmeny of the Orchard                           Montgomery, L.…
## 10         5342 The Story Girl                                   Montgomery, L.…
## 11        24873 Lucy Maud Montgomery Short Stories, 1896 to 1901 Montgomery, L.…
## 12        24874 Lucy Maud Montgomery Short Stories, 1902 to 1903 Montgomery, L.…
## 13        24875 Lucy Maud Montgomery Short Stories, 1904         Montgomery, L.…
## 14        24876 Lucy Maud Montgomery Short Stories, 1905 to 1906 Montgomery, L.…
## 15        24877 Lucy Maud Montgomery Short Stories, 1907 to 1908 Montgomery, L.…
## 16        24878 Lucy Maud Montgomery Short Stories, 1909 to 1922 Montgomery, L.…
## 17        67979 The Blue Castle: a novel                         Montgomery, L.…
# Keep only the works whose title contains "Anne"
gutenberg_works(gutenberg_author_id == 36) %>%
  select(gutenberg_id, title, author) %>%
  filter(str_detect(title, "Anne"))
## # A tibble: 4 × 3
##   gutenberg_id title                  author                       
##          <int> <chr>                  <chr>                        
## 1           45 Anne of Green Gables   Montgomery, L. M. (Lucy Maud)
## 2           47 Anne of Avonlea        Montgomery, L. M. (Lucy Maud)
## 3           51 Anne of the Island     Montgomery, L. M. (Lucy Maud)
## 4          544 Anne's House of Dreams Montgomery, L. M. (Lucy Maud)
# Download the full text of the "Anne" titles, carrying the title along
# as a metadata column
anne_books <- gutenberg_works(gutenberg_author_id == 36) %>%
  filter(str_detect(title, "Anne")) %>%
  gutenberg_download(meta_fields = "title")
anne_books
## # A tibble: 38,428 × 3
##    gutenberg_id text                                                     title  
##           <int> <chr>                                                    <chr>  
##  1           45 "ANNE OF GREEN GABLES"                                   Anne o…
##  2           45 ""                                                       Anne o…
##  3           45 "By Lucy Maud Montgomery"                                Anne o…
##  4           45 ""                                                       Anne o…
##  5           45 ""                                                       Anne o…
##  6           45 ""                                                       Anne o…
##  7           45 "Table of Contents"                                      Anne o…
##  8           45 ""                                                       Anne o…
##  9           45 "     CHAPTER I          Mrs. Rachel Lynde Is Surprised" Anne o…
## 10           45 "     CHAPTER II         Matthew Cuthbert Is Surprised"  Anne o…
## # ℹ 38,418 more rows
# Collapse every book's lines into a single string (one row per title), then
# swap the typographic apostrophe (’) for a straight one -- presumably so that
# contractions match the straight-apostrophe stopword entries; confirm.
anne_books_text <- anne_books %>%
  dplyr::group_by(title) %>%
  dplyr::summarise(text_whole = paste(text, collapse = " ")) %>%
  dplyr::mutate(text_whole = gsub("’", "'", text_whole))

# Inspect the result
anne_books_text
## # A tibble: 4 × 2
##   title                  text_whole                                             
##   <chr>                  <chr>                                                  
## 1 Anne of Avonlea        [Illustration]     ANNE OF AVONLEA  by Lucy Maud Montg…
## 2 Anne of Green Gables   ANNE OF GREEN GABLES  By Lucy Maud Montgomery    Table…
## 3 Anne of the Island     Anne of the Island  by Lucy Maud Montgomery     All pr…
## 4 Anne's House of Dreams Anne's House of Dreams   by  Lucy Maud Montgomery   “T…
# Build a quanteda corpus: one document per book, named by its title
anne_corpus <- anne_books_text %>%
  corpus(docid_field = "title", text_field = "text_whole")

anne_corpus
## Corpus consisting of 4 documents.
## Anne of Avonlea :
## "[Illustration]     ANNE OF AVONLEA  by Lucy Maud Montgomery ..."
## 
## Anne of Green Gables :
## "ANNE OF GREEN GABLES  By Lucy Maud Montgomery    Table of Co..."
## 
## Anne of the Island :
## "Anne of the Island  by Lucy Maud Montgomery     All precious..."
## 
## Anne's House of Dreams :
## "Anne's House of Dreams   by  Lucy Maud Montgomery   “To Laur..."
# Subset with `[` to show only the first document; the result is still a corpus
anne_corpus[1]
## Corpus consisting of 1 document.
## Anne of Avonlea :
## "[Illustration]     ANNE OF AVONLEA  by Lucy Maud Montgomery ..."
# Per-document counts of types, tokens and sentences
anne_corpus %>% summary()
## Corpus consisting of 4 documents, showing 4 documents:
## 
##                    Text Types Tokens Sentences
##         Anne of Avonlea  8358 106707      5671
##    Anne of Green Gables  8453 121673      6846
##      Anne of the Island  8831  92323      5921
##  Anne's House of Dreams  7816  97515      5709
# Tokenise into words, dropping punctuation
anne_corpus_tokens_orig <- tokens(
  anne_corpus,
  what = "word",
  remove_punct = TRUE
)
head(anne_corpus_tokens_orig)
## Tokens consisting of 4 documents.
## Anne of Avonlea :
##  [1] "Illustration" "ANNE"         "OF"           "AVONLEA"      "by"          
##  [6] "Lucy"         "Maud"         "Montgomery"   "To"           "my"          
## [11] "former"       "teacher"     
## [ ... and 88,798 more ]
## 
## Anne of Green Gables :
##  [1] "ANNE"       "OF"         "GREEN"      "GABLES"     "By"        
##  [6] "Lucy"       "Maud"       "Montgomery" "Table"      "of"        
## [11] "Contents"   "CHAPTER"   
## [ ... and 103,149 more ]
## 
## Anne of the Island :
##  [1] "Anne"       "of"         "the"        "Island"     "by"        
##  [6] "Lucy"       "Maud"       "Montgomery" "All"        "precious"  
## [11] "things"     "discovered"
## [ ... and 76,612 more ]
## 
## Anne's House of Dreams :
##  [1] "Anne's"     "House"      "of"         "Dreams"     "by"        
##  [6] "Lucy"       "Maud"       "Montgomery" "To"         "Laura"     
## [11] "in"         "memory"    
## [ ... and 80,421 more ]
# Convert tokens to lower case.
# NOTE(review): keep_acronyms = TRUE leaves every all-caps token untouched, so
# headings such as "ANNE", "GREEN" and "CHAPTER" remain uppercase in the
# printed tokens -- confirm this is intended before relying on the counts.
anne_corpus_tokens_orig <- anne_corpus_tokens_orig %>%
  tokens_tolower(keep_acronyms = TRUE)
head(anne_corpus_tokens_orig)
## Tokens consisting of 4 documents.
## Anne of Avonlea :
##  [1] "illustration" "ANNE"         "OF"           "AVONLEA"      "by"          
##  [6] "lucy"         "maud"         "montgomery"   "to"           "my"          
## [11] "former"       "teacher"     
## [ ... and 88,798 more ]
## 
## Anne of Green Gables :
##  [1] "ANNE"       "OF"         "GREEN"      "GABLES"     "by"        
##  [6] "lucy"       "maud"       "montgomery" "table"      "of"        
## [11] "contents"   "CHAPTER"   
## [ ... and 103,149 more ]
## 
## Anne of the Island :
##  [1] "anne"       "of"         "the"        "island"     "by"        
##  [6] "lucy"       "maud"       "montgomery" "all"        "precious"  
## [11] "things"     "discovered"
## [ ... and 76,612 more ]
## 
## Anne's House of Dreams :
##  [1] "anne's"     "house"      "of"         "dreams"     "by"        
##  [6] "lucy"       "maud"       "montgomery" "to"         "laura"     
## [11] "in"         "memory"    
## [ ... and 80,421 more ]
# Remove English stopwords plus the literal token "nbsp".
# padding = FALSE drops the removed tokens outright rather than leaving empty
# placeholders. `FALSE` is spelled out because `F` is an ordinary variable
# that can be reassigned.
anne_corpus_tokens <- tokens_remove(
  anne_corpus_tokens_orig,
  c(stopwords("english"), "nbsp"),
  padding = FALSE
)

head(anne_corpus_tokens)
## Tokens consisting of 4 documents.
## Anne of Avonlea :
##  [1] "illustration" "ANNE"         "AVONLEA"      "lucy"         "maud"        
##  [6] "montgomery"   "former"       "teacher"      "HATTIE"       "GORDON"      
## [11] "SMITH"        "grateful"    
## [ ... and 43,999 more ]
## 
## Anne of Green Gables :
##  [1] "ANNE"       "GREEN"      "GABLES"     "lucy"       "maud"      
##  [6] "montgomery" "table"      "contents"   "CHAPTER"    "mrs"       
## [11] "rachel"     "lynde"     
## [ ... and 49,929 more ]
## 
## Anne of the Island :
##  [1] "anne"       "island"     "lucy"       "maud"       "montgomery"
##  [6] "precious"   "things"     "discovered" "late"       "seek"      
## [11] "issue"      "forth"     
## [ ... and 38,545 more ]
## 
## Anne's House of Dreams :
##  [1] "anne's"     "house"      "dreams"     "lucy"       "maud"      
##  [6] "montgomery" "laura"      "memory"     "olden"      "time"      
## [11] "CONTENTS"   "chapter"   
## [ ... and 39,065 more ]
# Reduce each remaining token to its word stem
anne_corpus_tokens <- anne_corpus_tokens %>%
  tokens_wordstem()
anne_corpus_tokens
## Tokens consisting of 4 documents.
## Anne of Avonlea :
##  [1] "illustr"    "ANNE"       "AVONLEA"    "luci"       "maud"      
##  [6] "montgomeri" "former"     "teacher"    "HATTIE"     "GORDON"    
## [11] "SMITH"      "grate"     
## [ ... and 43,999 more ]
## 
## Anne of Green Gables :
##  [1] "ANNE"       "GREEN"      "GABLES"     "luci"       "maud"      
##  [6] "montgomeri" "tabl"       "content"    "CHAPTER"    "mrs"       
## [11] "rachel"     "lynd"      
## [ ... and 49,929 more ]
## 
## Anne of the Island :
##  [1] "ann"        "island"     "luci"       "maud"       "montgomeri"
##  [6] "precious"   "thing"      "discov"     "late"       "seek"      
## [11] "issu"       "forth"     
## [ ... and 38,545 more ]
## 
## Anne's House of Dreams :
##  [1] "ann"        "hous"       "dream"      "luci"       "maud"      
##  [6] "montgomeri" "laura"      "memori"     "olden"      "time"      
## [11] "CONTENTS"   "chapter"   
## [ ... and 39,065 more ]
# Find frequently co-occurring word pairs (bigrams; typically compound words)
anne_corpus_ngram <- dfm(tokens_ngrams(anne_corpus_tokens, n = 2))
textstat_frequency(anne_corpus_ngram, n = 50)
##             feature frequency rank docfreq group
## 1          said_ann       550    1       4   all
## 2          mrs_lynd       314    2       4   all
## 3       captain_jim       295    3       1   all
## 4     miss_cornelia       224    4       1   all
## 5        green_gabl       198    5       4   all
## 6      said_marilla       197    6       4   all
## 7       mr_harrison       178    7       3   all
## 8        mrs_rachel       164    8       4   all
## 9     miss_lavendar       160    9       3   all
## 10       said_diana       105   10       4   all
## 11    gilbert_blyth        90   11       4   all
## 12        mrs_allan        89   12       4   all
## 13       littl_girl        88   13       4   all
## 14         said_mrs        84   14       4   all
## 15        four_wind        84   14       1   all
## 16      ann_shirley        82   16       4   all
## 17         ann_said        78   17       4   all
## 18    aunt_jamesina        76   18       2   all
## 19 charlotta_fourth        75   19       3   all
## 20       oh_marilla        74   20       3   all
## 21       miss_staci        74   20       3   all
## 22        said_miss        74   20       4   all
## 23          ask_ann        73   23       4   all
## 24       rubi_gilli        71   24       4   all
## 25   mistress_blyth        70   25       1   all
## 26     miss_shirley        69   26       4   all
## 27       littl_hous        69   26       4   all
## 28        ever_sinc        67   28       4   all
## 29         lynd_say        66   29       4   all
## 30        come_back        65   30       4   all
## 31         year_ago        63   31       4   all
## 32        look_like        59   32       4   all
## 33         ann_look        59   32       4   all
## 34      patti_place        58   34       2   all
## 35        want_know        57   35       4   all
## 36        said_davi        57   35       3   all
## 37         well_now        57   35       3   all
## 38     said_gilbert        56   38       4   all
## 39         ann_felt        54   39       4   all
## 40           oh_ann        54   39       4   all
## 41       mrs_doctor        54   39       1   all
## 42        just_like        53   42       4   all
## 43       last_night        49   43       4   all
## 44        mrs_barri        49   43       2   all
## 45        ann_diana        48   45       3   all
## 46      rachel_lynd        47   46       4   all
## 47          one_day        47   46       4   all
## 48      ann_gilbert        47   46       4   all
## 49        owen_ford        47   46       1   all
## 50       mr_phillip        46   50       2   all
# Find frequently co-occurring three-word sequences (trigrams)
anne_corpus_ngram3 <- dfm(tokens_ngrams(anne_corpus_tokens, n = 3))
textstat_frequency(anne_corpus_ngram3, n = 50)
##                feature frequency rank docfreq group
## 1         mrs_lynd_say        63    1       3   all
## 2   said_miss_cornelia        45    2       1   all
## 3   miss_shirley_ma'am        43    3       3   all
## 4      mrs_rachel_lynd        38    4       4   all
## 5     said_captain_jim        38    4       1   all
## 6      mrs_doctor_dear        38    4       1   all
## 7      said_mrs_rachel        29    7       4   all
## 8     lake_shine_water        28    8       4   all
## 9   said_aunt_jamesina        24    9       1   all
## 10       said_mrs_lynd        17   10       4   all
## 11  said_miss_lavendar        16   11       1   all
## 12       ann_want_know        15   12       2   all
## 13    four_wind_harbor        15   12       1   all
## 14    said_mr_harrison        14   14       2   all
## 15       mrs_lynd_said        14   14       3   all
## 16   mrs_harmon_andrew        14   14       3   all
## 17       just_like_man        13   17       1   all
## 18     jog_along_black        12   18       1   all
## 19    along_black_mare        12   18       1   all
## 20    race_know_joseph        12   18       1   all
## 21    littl_hous_dream        12   18       1   all
## 22 princ_edward_island        11   22       3   all
## 23  shall_never_forget        11   22       3   all
## 24     never_said_word        11   22       4   all
## 25    ann_said_marilla        11   22       4   all
## 26     come_green_gabl        11   22       4   all
## 27        glen_st_mari        11   22       1   all
## 28   said_ann_dreamili        10   28       4   all
## 29         old_st_john        10   28       1   all
## 30    littl_stone_hous         9   30       1   all
## 31     back_green_gabl         9   30       3   all
## 32    drew_long_breath         9   30       2   all
## 33       davi_said_ann         9   30       2   all
## 34      ann_shook_head         9   30       4   all
## 35     oh_miss_shirley         9   30       1   all
## 36      well_now_dunno         9   30       1   all
## 37  miss_cornelia_said         9   30       1   all
## 38        mr_mrs_allan         8   38       4   all
## 39      green_gabl_ann         8   38       3   all
## 40    said_ann_thought         8   38       3   all
## 41    mr_harmon_andrew         8   38       2   all
## 42       said_ann_soft         8   38       4   all
## 43  _you__know_teacher         8   38       1   all
## 44   mrs_morgan_heroin         8   38       1   all
## 45    allan_miss_staci         8   38       2   all
## 46      young_mari_joe         8   38       2   all
## 47      said_ann_decid         8   38       3   all
## 48      look_much_like         8   38       4   all
## 49     green_gabl_said         8   38       3   all
## 50         oh_said_ann         8   38       2   all
# 50 most frequent features; the tokens are turned into a document-feature
# matrix and stemmed at the dfm level before counting
topfeatures(dfm_wordstem(dfm(anne_corpus_tokens)), n = 50)
##     ann    said marilla      go    like     one     mrs   never    just   littl 
##    3984    2331    1307    1302    1296    1098    1072    1060    1033    1031 
##   think    know     say    look    come    miss    well   diana   thing      oh 
##    1019     977     927     915     859     817     806     795     767     740 
##     see     old    good    time gilbert     now     get    even    much    girl 
##     705     692     688     685     656     637     611     610     606     594 
## thought    want     can     day    ever    love    home      mr    came    make 
##     593     587     586     558     544     541     540     529     520     509 
##    went    must     eye   alway    year    tell    back    seem     ask    feel 
##     501     491     484     483     481     472     464     463     456     454

Stopwords

# Size of the English stopword list returned via quanteda
quanteda::stopwords("english") %>% length()
## [1] 175
# Size of the "smart" stopword list
stopwords::stopwords(source = "smart") %>% length()
## [1] 571
# Size of the "snowball" stopword list
stopwords::stopwords(source = "snowball") %>% length()
## [1] 175
# Size of the "stopwords-iso" stopword list
stopwords::stopwords(source = "stopwords-iso") %>% length()
## [1] 1298
# First 100 entries of the English stopword list returned via quanteda
head(quanteda::stopwords("english"), n = 100)
##   [1] "i"          "me"         "my"         "myself"     "we"        
##   [6] "our"        "ours"       "ourselves"  "you"        "your"      
##  [11] "yours"      "yourself"   "yourselves" "he"         "him"       
##  [16] "his"        "himself"    "she"        "her"        "hers"      
##  [21] "herself"    "it"         "its"        "itself"     "they"      
##  [26] "them"       "their"      "theirs"     "themselves" "what"      
##  [31] "which"      "who"        "whom"       "this"       "that"      
##  [36] "these"      "those"      "am"         "is"         "are"       
##  [41] "was"        "were"       "be"         "been"       "being"     
##  [46] "have"       "has"        "had"        "having"     "do"        
##  [51] "does"       "did"        "doing"      "would"      "should"    
##  [56] "could"      "ought"      "i'm"        "you're"     "he's"      
##  [61] "she's"      "it's"       "we're"      "they're"    "i've"      
##  [66] "you've"     "we've"      "they've"    "i'd"        "you'd"     
##  [71] "he'd"       "she'd"      "we'd"       "they'd"     "i'll"      
##  [76] "you'll"     "he'll"      "she'll"     "we'll"      "they'll"   
##  [81] "isn't"      "aren't"     "wasn't"     "weren't"    "hasn't"    
##  [86] "haven't"    "hadn't"     "doesn't"    "don't"      "didn't"    
##  [91] "won't"      "wouldn't"   "shan't"     "shouldn't"  "can't"     
##  [96] "cannot"     "couldn't"   "mustn't"    "let's"      "that's"
# First 100 entries of the "smart" stopword list
head(stopwords::stopwords(source = "smart"), n = 100)
##   [1] "a"            "a's"          "able"         "about"        "above"       
##   [6] "according"    "accordingly"  "across"       "actually"     "after"       
##  [11] "afterwards"   "again"        "against"      "ain't"        "all"         
##  [16] "allow"        "allows"       "almost"       "alone"        "along"       
##  [21] "already"      "also"         "although"     "always"       "am"          
##  [26] "among"        "amongst"      "an"           "and"          "another"     
##  [31] "any"          "anybody"      "anyhow"       "anyone"       "anything"    
##  [36] "anyway"       "anyways"      "anywhere"     "apart"        "appear"      
##  [41] "appreciate"   "appropriate"  "are"          "aren't"       "around"      
##  [46] "as"           "aside"        "ask"          "asking"       "associated"  
##  [51] "at"           "available"    "away"         "awfully"      "b"           
##  [56] "be"           "became"       "because"      "become"       "becomes"     
##  [61] "becoming"     "been"         "before"       "beforehand"   "behind"      
##  [66] "being"        "believe"      "below"        "beside"       "besides"     
##  [71] "best"         "better"       "between"      "beyond"       "both"        
##  [76] "brief"        "but"          "by"           "c"            "c'mon"       
##  [81] "c's"          "came"         "can"          "can't"        "cannot"      
##  [86] "cant"         "cause"        "causes"       "certain"      "certainly"   
##  [91] "changes"      "clearly"      "co"           "com"          "come"        
##  [96] "comes"        "concerning"   "consequently" "consider"     "considering"
# First 100 entries of the "snowball" stopword list
head(stopwords::stopwords(source = "snowball"), n = 100)
##   [1] "i"          "me"         "my"         "myself"     "we"        
##   [6] "our"        "ours"       "ourselves"  "you"        "your"      
##  [11] "yours"      "yourself"   "yourselves" "he"         "him"       
##  [16] "his"        "himself"    "she"        "her"        "hers"      
##  [21] "herself"    "it"         "its"        "itself"     "they"      
##  [26] "them"       "their"      "theirs"     "themselves" "what"      
##  [31] "which"      "who"        "whom"       "this"       "that"      
##  [36] "these"      "those"      "am"         "is"         "are"       
##  [41] "was"        "were"       "be"         "been"       "being"     
##  [46] "have"       "has"        "had"        "having"     "do"        
##  [51] "does"       "did"        "doing"      "would"      "should"    
##  [56] "could"      "ought"      "i'm"        "you're"     "he's"      
##  [61] "she's"      "it's"       "we're"      "they're"    "i've"      
##  [66] "you've"     "we've"      "they've"    "i'd"        "you'd"     
##  [71] "he'd"       "she'd"      "we'd"       "they'd"     "i'll"      
##  [76] "you'll"     "he'll"      "she'll"     "we'll"      "they'll"   
##  [81] "isn't"      "aren't"     "wasn't"     "weren't"    "hasn't"    
##  [86] "haven't"    "hadn't"     "doesn't"    "don't"      "didn't"    
##  [91] "won't"      "wouldn't"   "shan't"     "shouldn't"  "can't"     
##  [96] "cannot"     "couldn't"   "mustn't"    "let's"      "that's"
# First 100 entries of the "stopwords-iso" stopword list
head(stopwords::stopwords(source = "stopwords-iso"), n = 100)
##   [1] "'ll"           "'tis"          "'twas"         "'ve"          
##   [5] "10"            "39"            "a"             "a's"          
##   [9] "able"          "ableabout"     "about"         "above"        
##  [13] "abroad"        "abst"          "accordance"    "according"    
##  [17] "accordingly"   "across"        "act"           "actually"     
##  [21] "ad"            "added"         "adj"           "adopted"      
##  [25] "ae"            "af"            "affected"      "affecting"    
##  [29] "affects"       "after"         "afterwards"    "ag"           
##  [33] "again"         "against"       "ago"           "ah"           
##  [37] "ahead"         "ai"            "ain't"         "aint"         
##  [41] "al"            "all"           "allow"         "allows"       
##  [45] "almost"        "alone"         "along"         "alongside"    
##  [49] "already"       "also"          "although"      "always"       
##  [53] "am"            "amid"          "amidst"        "among"        
##  [57] "amongst"       "amoungst"      "amount"        "an"           
##  [61] "and"           "announce"      "another"       "any"          
##  [65] "anybody"       "anyhow"        "anymore"       "anyone"       
##  [69] "anything"      "anyway"        "anyways"       "anywhere"     
##  [73] "ao"            "apart"         "apparently"    "appear"       
##  [77] "appreciate"    "appropriate"   "approximately" "aq"           
##  [81] "ar"            "are"           "area"          "areas"        
##  [85] "aren"          "aren't"        "arent"         "arise"        
##  [89] "around"        "arpa"          "as"            "aside"        
##  [93] "ask"           "asked"         "asking"        "asks"         
##  [97] "associated"    "at"            "au"            "auth"
# Lower-cased list of English contractions shipped with TeXCheckR
contractions <- tolower(TeXCheckR::valid_English_contractions)

contractions
##   [1] "ain't"          "aren't"         "can't"          "could've"      
##   [5] "couldn't"       "couldn't've"    "didn't"         "doesn't"       
##   [9] "don't"          "hadn't"         "hadn't've"      "hasn't"        
##  [13] "haven't"        "he'd"           "he'd've"        "he'll"         
##  [17] "he's"           "how'd"          "how'll"         "how's"         
##  [21] "i'd"            "i'd've"         "i'll"           "i'm"           
##  [25] "i've"           "isn't"          "it'd"           "it'd've"       
##  [29] "it'll"          "it's"           "let's"          "ma'am"         
##  [33] "mightn't"       "mightn't've"    "might've"       "mustn't"       
##  [37] "must've"        "needn't"        "not've"         "o'clock"       
##  [41] "oughtn't"       "'ow's'at"       "shan't"         "she'd"         
##  [45] "she'd've"       "she'll"         "she's"          "should've"     
##  [49] "shouldn't"      "shouldn't've"   "somebody'd"     "somebody'd've" 
##  [53] "somebody'll"    "somebody's"     "someone'd"      "someone'd've"  
##  [57] "someone'll"     "someone's"      "something'd"    "something'd've"
##  [61] "something'll"   "something's"    "that'll"        "that's"        
##  [65] "there'd"        "there'd've"     "there're"       "there's"       
##  [69] "they'd"         "they'd've"      "they'll"        "they're"       
##  [73] "they've"        "'twas"          "wasn't"         "we'd"          
##  [77] "we'd've"        "we'll"          "we're"          "we've"         
##  [81] "weren't"        "what'll"        "what're"        "what's"        
##  [85] "what've"        "when's"         "where'd"        "where's"       
##  [89] "where've"       "who'd"          "who'd've"       "who'll"        
##  [93] "who're"         "who's"          "who've"         "why'll"        
##  [97] "why're"         "why's"          "won't"          "would've"      
## [101] "wouldn't"       "wouldn't've"    "y'all"          "y'all'll"      
## [105] "y'all'd've"     "you'd"          "you'd've"       "you'll"        
## [109] "you're"         "you've"
# Word-frequency table shipped with cleanNLP; report how many words it holds
freq_words <- cleanNLP::word_frequency
length(freq_words$word)
## [1] 150000
# First 100 words of the cleanNLP word-frequency table
head(cleanNLP::word_frequency$word, n = 100)
##   [1] "the"         "of"          "and"         "to"          "a"          
##   [6] "in"          "for"         "is"          "on"          "that"       
##  [11] "by"          "this"        "with"        "i"           "you"        
##  [16] "it"          "not"         "or"          "be"          "are"        
##  [21] "from"        "at"          "as"          "your"        "all"        
##  [26] "have"        "new"         "more"        "an"          "was"        
##  [31] "we"          "will"        "home"        "can"         "us"         
##  [36] "about"       "if"          "page"        "my"          "has"        
##  [41] "search"      "free"        "but"         "our"         "one"        
##  [46] "other"       "do"          "no"          "information" "time"       
##  [51] "they"        "site"        "he"          "up"          "may"        
##  [56] "what"        "which"       "their"       "news"        "out"        
##  [61] "use"         "any"         "there"       "see"         "only"       
##  [66] "so"          "his"         "when"        "contact"     "here"       
##  [71] "business"    "who"         "web"         "also"        "now"        
##  [76] "help"        "get"         "pm"          "view"        "online"     
##  [81] "c"           "e"           "first"       "am"          "been"       
##  [86] "would"       "how"         "were"        "me"          "s"          
##  [91] "services"    "some"        "these"       "click"       "its"        
##  [96] "like"        "service"     "x"           "than"        "find"

Anne of Green Gables Alternate Stopwords

# Same stopword-removal step, but using the larger "stopwords-iso" list (plus
# the literal token "nbsp") instead of the default English list.
# padding = FALSE drops the removed tokens outright rather than leaving empty
# placeholders. `FALSE` is spelled out because `F` is an ordinary variable
# that can be reassigned.
anne_corpus_tokens_alt <- tokens_remove(
  anne_corpus_tokens_orig,
  c(stopwords::stopwords(source = "stopwords-iso"), "nbsp"),
  padding = FALSE
)

head(anne_corpus_tokens_alt)
## Tokens consisting of 4 documents.
## Anne of Avonlea :
##  [1] "illustration" "ANNE"         "AVONLEA"      "lucy"         "maud"        
##  [6] "montgomery"   "teacher"      "HATTIE"       "GORDON"       "SMITH"       
## [11] "grateful"     "remembrance" 
## [ ... and 28,674 more ]
## 
## Anne of Green Gables :
##  [1] "ANNE"       "GREEN"      "GABLES"     "lucy"       "maud"      
##  [6] "montgomery" "table"      "contents"   "CHAPTER"    "rachel"    
## [11] "lynde"      "surprised" 
## [ ... and 32,589 more ]
## 
## Anne of the Island :
##  [1] "anne"       "island"     "lucy"       "maud"       "montgomery"
##  [6] "precious"   "discovered" "late"       "seek"       "issue"     
## [11] "love"       "sequel"    
## [ ... and 25,699 more ]
## 
## Anne's House of Dreams :
##  [1] "anne's"     "house"      "dreams"     "lucy"       "maud"      
##  [6] "montgomery" "laura"      "memory"     "olden"      "time"      
## [11] "CONTENTS"   "chapter"   
## [ ... and 25,287 more ]
# Reduce each remaining token to its word stem
anne_corpus_tokens_alt <- anne_corpus_tokens_alt %>%
  tokens_wordstem()
anne_corpus_tokens_alt
## Tokens consisting of 4 documents.
## Anne of Avonlea :
##  [1] "illustr"    "ANNE"       "AVONLEA"    "luci"       "maud"      
##  [6] "montgomeri" "teacher"    "HATTIE"     "GORDON"     "SMITH"     
## [11] "grate"      "remembr"   
## [ ... and 28,674 more ]
## 
## Anne of Green Gables :
##  [1] "ANNE"       "GREEN"      "GABLES"     "luci"       "maud"      
##  [6] "montgomeri" "tabl"       "content"    "CHAPTER"    "rachel"    
## [11] "lynd"       "surpris"   
## [ ... and 32,589 more ]
## 
## Anne of the Island :
##  [1] "ann"        "island"     "luci"       "maud"       "montgomeri"
##  [6] "precious"   "discov"     "late"       "seek"       "issu"      
## [11] "love"       "sequel"    
## [ ... and 25,699 more ]
## 
## Anne's House of Dreams :
##  [1] "ann"        "hous"       "dream"      "luci"       "maud"      
##  [6] "montgomeri" "laura"      "memori"     "olden"      "time"      
## [11] "CONTENTS"   "chapter"   
## [ ... and 25,287 more ]
# Find frequently co-occurring word pairs (bigrams; typically compound words)
# after the alternate stopword removal
anne_corpus_ngram_alt <- dfm(tokens_ngrams(anne_corpus_tokens_alt, n = 2))
textstat_frequency(anne_corpus_ngram_alt, n = 50)
##             feature frequency rank docfreq group
## 1       captain_jim       295    1       1   all
## 2        green_gabl       198    2       4   all
## 3     gilbert_blyth        90    3       4   all
## 4       ann_shirley        82    4       4   all
## 5     aunt_jamesina        76    5       2   all
## 6  charlotta_fourth        75    6       3   all
## 7        rubi_gilli        71    7       4   all
## 8    mistress_blyth        70    8       1   all
## 9       ann_gilbert        67    9       4   all
## 10      marilla_ann        66   10       4   all
## 11        ann_diana        66   10       4   all
## 12          ann_ann        61   12       4   all
## 13      ann_marilla        50   13       4   all
## 14      rachel_lynd        47   14       4   all
## 15        owen_ford        47   14       1   all
## 16       white_sand        45   16       3   all
## 17         josi_pye        45   16       2   all
## 18          cri_ann        44   18       4   all
## 19    shirley_ma'am        43   19       3   all
## 20        dick_moor        43   19       1   all
## 21      jane_andrew        42   21       4   all
## 22         ann_look        40   22       4   all
## 23        east_gabl        40   22       3   all
## 24     charli_sloan        40   22       4   all
## 25         mari_joe        39   25       2   all
## 26       hous_dream        38   26       3   all
## 27       shook_head        38   26       4   all
## 28       haunt_wood        38   26       4   all
## 29        diana_ann        36   29       4   all
## 30         time_ann        35   30       4   all
## 31        ann_laugh        35   30       4   all
## 32     kindr_spirit        33   32       4   all
## 33        laugh_ann        33   32       4   all
## 34   moodi_spurgeon        31   34       3   all
## 35    sunday_school        31   34       3   all
## 36        ann_deari        31   34       1   all
## 37        echo_lodg        30   37       3   all
## 38         told_ann        30   37       4   all
## 39       stone_hous        29   39       2   all
## 40         ann_feel        29   39       4   all
## 41         paul_irv        28   41       2   all
## 42          day_ann        28   41       4   all
## 43       lover_lane        28   41       4   all
## 44       lake_shine        28   41       4   all
## 45      shine_water        28   41       4   all
## 46      gilbert_ann        27   46       4   all
## 47 matthew_cuthbert        27   46       2   all
## 48       answer_ann        26   48       4   all
## 49        ann_lesli        26   48       1   all
## 50         sigh_ann        25   50       4   all
# Find frequently co-occurring word triples:
# build 3-grams from the stemmed tokens, count them in a dfm, show the top 50
anne_corpus_ngram3_alt <- dfm(tokens_ngrams(anne_corpus_tokens_alt, n = 3))
textstat_frequency(anne_corpus_ngram3_alt, n = 50)
##                      feature frequency rank docfreq group
## 1           lake_shine_water        28    1       4   all
## 2             ann_green_gabl        13    2       4   all
## 3          gilbert_blyth_ann        12    3       3   all
## 4             green_gabl_ann        12    3       4   all
## 5             jog_black_mare        12    3       1   all
## 6        princ_edward_island        11    6       3   all
## 7             ann_shook_head        10    7       4   all
## 8            ann_captain_jim         9    8       1   all
## 9             ann_clasp_hand         7    9       2   all
## 10         wrong_upper_stori         7    9       1   all
## 11           green_gabl_even         7    9       3   all
## 12           stay_green_gabl         7    9       2   all
## 13        green_gabl_marilla         7    9       3   all
## 14 moodi_spurgeon_macpherson         7    9       2   all
## 15    mistress_blyth_captain         7    9       1   all
## 16         blyth_captain_jim         7    9       1   all
## 17           captain_jim_ann         7    9       1   all
## 18          white_sand_hotel         6   18       2   all
## 19         ann_gilbert_blyth         6   18       2   all
## 20       gilbert_captain_jim         6   18       1   all
## 21        belong_race_joseph         6   18       1   all
## 22         john_henri_carter         5   22       1   all
## 23         blyth_ann_shirley         5   22       3   all
## 24          arriv_green_gabl         5   22       3   all
## 25          ann_told_marilla         5   22       2   all
## 26         return_green_gabl         5   22       3   all
## 27            green_gabl_day         5   22       3   all
## 28            sit_porch_step         5   22       2   all
## 29        hear_gilbert_blyth         5   22       3   all
## 30            jane_rubi_josi         5   22       1   all
## 31      captain_jim_cornelia         5   22       1   all
## 32        captain_jim_slowli         5   22       1   all
## 33         captain_jim_shook         5   22       1   all
## 34            jim_shook_head         5   22       1   all
## 35      cornelia_captain_jim         5   22       1   all
## 36          captain_jim_told         5   22       1   all
## 37       captain_jim_gilbert         5   22       1   all
## 38      luci_maud_montgomeri         4   38       4   all
## 39     villag_improv_societi         4   38       1   all
## 40     educ_public_sentiment         4   38       1   all
## 41           green_gabl_lane         4   38       2   all
## 42             ann_bed_night         4   38       2   all
## 43           left_green_gabl         4   38       2   all
## 44         ann_orchard_slope         4   38       3   all
## 45        hester_gray_garden         4   38       2   all
## 46        green_gabl_kitchen         4   38       2   all
## 47          uncl_abe_predict         4   38       1   all
## 48             east_gabl_ann         4   38       2   all
## 49            uncl_abe_storm         4   38       1   all
## 50       marri_gilbert_blyth         4   38       3   all
# 50 most frequent stems across the four books (alternate stopword list).
# NOTE(review): anne_corpus_tokens_alt was already stemmed with
# tokens_wordstem() above, so dfm_wordstem() is a second stemming pass applied
# after dfm() lowercases the features -- confirm the double stemming is intended.
topfeatures(dfm_wordstem(dfm(anne_corpus_tokens_alt)), n = 50)
##      ann  marilla    diana     time  gilbert     girl      day     love 
##     3984     1307      795      685      656      594      558      541 
##      eye     feel     hous     davi  matthew     life     told     live 
##      484      454      446      419      392      385      383      382 
##     lynd    night    lesli    white     talk  captain    peopl    green 
##      373      371      365      324      324      318      316      316 
##      jim     hand    marri     look    suppo   school   beauti     mind 
##      313      309      308      306      305      302      296      282 
##     even    laugh     gabl    dream    heart   imagin cornelia   mother 
##      282      280      269      259      259      258      258      250 
##   friend     head      sit     jane     hair  chapter      boy     walk 
##      248      245      242      242      241      240      240      239 
##  avonlea     hope 
##      237      236
# Compare with the original: same top-50 stem summary for anne_corpus_tokens
# (the non-"alt" token set)
topfeatures(dfm_wordstem(dfm(anne_corpus_tokens)), n = 50)
##     ann    said marilla      go    like     one     mrs   never    just   littl 
##    3984    2331    1307    1302    1296    1098    1072    1060    1033    1031 
##   think    know     say    look    come    miss    well   diana   thing      oh 
##    1019     977     927     915     859     817     806     795     767     740 
##     see     old    good    time gilbert     now     get    even    much    girl 
##     705     692     688     685     656     637     611     610     606     594 
## thought    want     can     day    ever    love    home      mr    came    make 
##     593     587     586     558     544     541     540     529     520     509 
##    went    must     eye   alway    year    tell    back    seem     ask    feel 
##     501     491     484     483     481     472     464     463     456     454

Marvel

Data source: https://www.kaggle.com/datasets/phiitm/marvel-cinematic-universe-dialogue-dataset

# Read every file matching data/marvel/* (path resolved from the project root)
# and build a corpus keyed by file name, one document per script
marvel_script_text <- here::here("data/marvel/*") %>% readtext()
marvel_corpus <- marvel_script_text %>%
  corpus(docid_field = "doc_id", text_field = "text")

marvel_corpus
## Corpus consisting of 23 documents.
## Ant-Man.And.The.Wasp.txt :
## "I still think about the night your mother and I had to leave..."
## 
## Ant-Man.txt :
## "Stark! He doesn't seem happy. Hello, Hank. You're supposed t..."
## 
## Avengers.Age.of.Ultron.txt :
## "(DISTANT EXPLOSION) STRUCKER ON PA: Report to your stations ..."
## 
## Avengers.Endgame.txt :
## "Okay, hold on, don't shoot. - You see where you're going? - ..."
## 
## Avengers.Infinity.War.txt :
## "This is the Asgardian refugee vessel Statesman. We are under..."
## 
## Avengers.txt :
## "The Tesseract has awakened. It is on a little world, a human..."
## 
## [ reached max_ndoc ... 17 more documents ]
# Inspect only the first document of the corpus
marvel_corpus[1L]
## Corpus consisting of 1 document.
## Ant-Man.And.The.Wasp.txt :
## "I still think about the night your mother and I had to leave..."
# Summarize the corpus: types, tokens and sentences per document
marvel_corpus %>% summary()
## Corpus consisting of 23 documents, showing 23 documents:
## 
##                                    Text Types Tokens Sentences
##                Ant-Man.And.The.Wasp.txt  1979  13654      1913
##                             Ant-Man.txt  2051  12139      1603
##              Avengers.Age.of.Ultron.txt  2410  15275      1812
##                    Avengers.Endgame.txt  2218  15866      2170
##               Avengers.Infinity.War.txt  1919  11879      1637
##                            Avengers.txt  2269  12623      1654
##                       Black.Panther.txt  1743  10308      1420
##           Captain.America.Civil.War.txt  2499  14502      1693
##   Captain.America.The.First.Avenger.txt  1917   9336      1199
##  Captain.America.The.Winter.Soldier.txt  2219  10817      1406
##                      Captain.Marvel.txt  1762   9646      1386
##                      Doctor.Strange.txt  1698   8907      1219
##             Guardians.of.the.Galaxy.txt  1839  10201      1258
##      Guardians.of.the.Galaxy.Vol. 2.txt  1904  12031      1479
##                          Iron-Man.2.txt  2291  14985      1538
##                          Iron-Man.3.txt  2668  17136      1972
##                            Iron-Man.txt  2176  12979      1330
##            Spider-Man.Far.From.Home.txt  2081  14453      2248
##               Spider-Man.Homecoming.txt  2203  15847      2494
##                 The.Incredible.Hulk.txt  1284   6375       738
##                       Thor.Ragnarok.txt  1871  11815      1697
##                 Thor.The.Dark.World.txt  1550   7943      1063
##                                Thor.txt  1642   8711      1026
# Tokenize the corpus into words, dropping punctuation tokens
marvel_corpus_tokens_orig <- marvel_corpus %>%
  tokens(what = "word", remove_punct = TRUE)
marvel_corpus_tokens_orig %>% head()
## Tokens consisting of 6 documents.
## Ant-Man.And.The.Wasp.txt :
##  [1] "I"      "still"  "think"  "about"  "the"    "night"  "your"   "mother"
##  [9] "and"    "I"      "had"    "to"    
## [ ... and 10,179 more ]
## 
## Ant-Man.txt :
##  [1] "Stark"    "He"       "doesn't"  "seem"     "happy"    "Hello"   
##  [7] "Hank"     "You're"   "supposed" "to"       "be"       "in"      
## [ ... and 9,363 more ]
## 
## Avengers.Age.of.Ultron.txt :
##  [1] "DISTANT"     "EXPLOSION"   "STRUCKER"    "ON"          "PA"         
##  [6] "Report"      "to"          "your"        "stations"    "immediately"
## [11] "This"        "is"         
## [ ... and 11,180 more ]
## 
## Avengers.Endgame.txt :
##  [1] "Okay"   "hold"   "on"     "don't"  "shoot"  "You"    "see"    "where" 
##  [9] "you're" "going"  "Mm-hmm" "Okay"  
## [ ... and 11,712 more ]
## 
## Avengers.Infinity.War.txt :
##  [1] "This"      "is"        "the"       "Asgardian" "refugee"   "vessel"   
##  [7] "Statesman" "We"        "are"       "under"     "assault"   "I"        
## [ ... and 8,907 more ]
## 
## Avengers.txt :
##  [1] "The"       "Tesseract" "has"       "awakened"  "It"        "is"       
##  [7] "on"        "a"         "little"    "world"     "a"         "human"    
## [ ... and 10,109 more ]
# Convert tokens to lower case. keep_acronyms = TRUE leaves all-uppercase
# tokens untouched, so stage directions such as "DISTANT" stay in capitals.
marvel_corpus_tokens_orig <- marvel_corpus_tokens_orig %>%
  tokens_tolower(keep_acronyms = TRUE)
marvel_corpus_tokens_orig %>% head()
## Tokens consisting of 6 documents.
## Ant-Man.And.The.Wasp.txt :
##  [1] "i"      "still"  "think"  "about"  "the"    "night"  "your"   "mother"
##  [9] "and"    "i"      "had"    "to"    
## [ ... and 10,179 more ]
## 
## Ant-Man.txt :
##  [1] "stark"    "he"       "doesn't"  "seem"     "happy"    "hello"   
##  [7] "hank"     "you're"   "supposed" "to"       "be"       "in"      
## [ ... and 9,363 more ]
## 
## Avengers.Age.of.Ultron.txt :
##  [1] "DISTANT"     "EXPLOSION"   "STRUCKER"    "ON"          "PA"         
##  [6] "report"      "to"          "your"        "stations"    "immediately"
## [11] "this"        "is"         
## [ ... and 11,180 more ]
## 
## Avengers.Endgame.txt :
##  [1] "okay"   "hold"   "on"     "don't"  "shoot"  "you"    "see"    "where" 
##  [9] "you're" "going"  "mm-hmm" "okay"  
## [ ... and 11,712 more ]
## 
## Avengers.Infinity.War.txt :
##  [1] "this"      "is"        "the"       "asgardian" "refugee"   "vessel"   
##  [7] "statesman" "we"        "are"       "under"     "assault"   "i"        
## [ ... and 8,907 more ]
## 
## Avengers.txt :
##  [1] "the"       "tesseract" "has"       "awakened"  "it"        "is"       
##  [7] "on"        "a"         "little"    "world"     "a"         "human"    
## [ ... and 10,109 more ]
# Drop English stopwords plus the stray HTML entity name "nbsp".
# padding = FALSE removes the tokens outright instead of leaving empty
# placeholders; spelled out as FALSE because `F` is an ordinary variable
# that can be reassigned.
marvel_corpus_tokens <- tokens_remove(
  marvel_corpus_tokens_orig,
  c(stopwords("english"), "nbsp"),
  padding = FALSE
)

head(marvel_corpus_tokens)
## Tokens consisting of 6 documents.
## Ant-Man.And.The.Wasp.txt :
##  [1] "still"      "think"      "night"      "mother"     "leave"     
##  [6] "hopefully"  "long"       "call"       "get"        "settled"   
## [11] "better"     "indication"
## [ ... and 4,924 more ]
## 
## Ant-Man.txt :
##  [1] "stark"    "seem"     "happy"    "hello"    "hank"     "supposed"
##  [7] "moscow"   "took"     "detour"   "defense"  "lab"      "tell"    
## [ ... and 4,480 more ]
## 
## Avengers.Age.of.Ultron.txt :
##  [1] "DISTANT"      "EXPLOSION"    "STRUCKER"     "PA"           "report"      
##  [6] "stations"     "immediately"  "drill"        "attack"       "SOLDIERS"    
## [11] "SHOUTING"     "INDISTINCTLY"
## [ ... and 5,717 more ]
## 
## Avengers.Endgame.txt :
##  [1] "okay"   "hold"   "shoot"  "see"    "going"  "mm-hmm" "okay"   "now"   
##  [9] "worry"  "get"    "gotta"  "move"  
## [ ... and 5,533 more ]
## 
## Avengers.Infinity.War.txt :
##  [1] "asgardian" "refugee"   "vessel"    "statesman" "assault"   "repeat"   
##  [7] "assault"   "engines"   "dead"      "life"      "support"   "failing"  
## [ ... and 4,127 more ]
## 
## Avengers.txt :
##  [1] "tesseract" "awakened"  "little"    "world"     "human"     "world"    
##  [7] "wield"     "power"     "ally"      "knows"     "workings"  "never"    
## [ ... and 4,696 more ]
# Reduce every remaining token to its word stem (e.g. "leave" -> "leav")
marvel_corpus_tokens <- marvel_corpus_tokens %>% tokens_wordstem()
marvel_corpus_tokens
## Tokens consisting of 23 documents.
## Ant-Man.And.The.Wasp.txt :
##  [1] "still"  "think"  "night"  "mother" "leav"   "hope"   "long"   "call"  
##  [9] "get"    "settl"  "better" "indic" 
## [ ... and 4,924 more ]
## 
## Ant-Man.txt :
##  [1] "stark"  "seem"   "happi"  "hello"  "hank"   "suppos" "moscow" "took"  
##  [9] "detour" "defens" "lab"    "tell"  
## [ ... and 4,480 more ]
## 
## Avengers.Age.of.Ultron.txt :
##  [1] "DISTANT"      "EXPLOSION"    "STRUCKER"     "PA"           "report"      
##  [6] "station"      "immedi"       "drill"        "attack"       "SOLDIERS"    
## [11] "SHOUTING"     "INDISTINCTLi"
## [ ... and 5,717 more ]
## 
## Avengers.Endgame.txt :
##  [1] "okay"   "hold"   "shoot"  "see"    "go"     "mm-hmm" "okay"   "now"   
##  [9] "worri"  "get"    "gotta"  "move"  
## [ ... and 5,533 more ]
## 
## Avengers.Infinity.War.txt :
##  [1] "asgardian" "refuge"    "vessel"    "statesman" "assault"   "repeat"   
##  [7] "assault"   "engin"     "dead"      "life"      "support"   "fail"     
## [ ... and 4,127 more ]
## 
## Avengers.txt :
##  [1] "tesseract" "awaken"    "littl"     "world"     "human"     "world"    
##  [7] "wield"     "power"     "alli"      "know"      "work"      "never"    
## [ ... and 4,696 more ]
## 
## [ reached max_ndoc ... 17 more documents ]
# Find frequently co-occurring word pairs (typically compound words or names):
# build 2-grams from the stemmed tokens, count them in a dfm, show the top 50
marvel_corpus_ngram <- dfm(tokens_ngrams(marvel_corpus_tokens, n = 2))
textstat_frequency(marvel_corpus_ngram, n = 50)
##            feature frequency rank docfreq group
## 1           oh_god       117    1      21   all
## 2         mr_stark       102    2      11   all
## 3        right_now        97    3      21   all
## 4        look_like        87    4      21   all
## 5            go_go        86    5      19   all
## 6          hey_hey        66    6      12   all
## 7         iron_man        59    7      11   all
## 8        okay_okay        56    8      14   all
## 9         get_back        54    9      18   all
## 10       yeah_yeah        53   10      15   all
## 11      toni_stark        52   11       9   all
## 12       wait_wait        50   12      12   all
## 13       come_come        46   13      16   all
## 14       come_back        44   14      17   all
## 15       know_know        44   14      19   all
## 16 captain_america        41   16       8   all
## 17        year_ago        39   17      17   all
## 18         can_get        38   18      15   all
## 19         go_back        36   19      16   all
## 20        new_york        36   19      14   all
## 21         oh_yeah        35   21      17   all
## 22         can_see        35   21      15   all
## 23       need_help        34   23      17   all
## 24       yeah_well        33   24      17   all
## 25       even_know        32   25      20   all
## 26          go_get        32   25      16   all
## 27      gonna_need        32   25      17   all
## 28          let_go        32   25      14   all
## 29       just_like        31   29      17   all
## 30        gonna_go        30   30      13   all
## 31         yes_sir        30   30      13   all
## 32       feel_like        28   32      13   all
## 33      sound_like        28   32      15   all
## 34       yeah_know        28   32      15   all
## 35       one_thing        28   32      17   all
## 36       make_sure        28   32      15   all
## 37       move_move        28   32       9   all
## 38       okay_yeah        28   32      12   all
## 39     right_right        27   39      15   all
## 40        need_get        26   40      13   all
## 41       just_need        26   40      18   all
## 42       just_want        26   40      16   all
## 43        can_help        26   40      17   all
## 44      right_yeah        26   40      12   all
## 45        gotta_go        25   45       6   all
## 46         two_one        25   45      11   all
## 47      bring_back        24   47      10   all
## 48        can_take        24   47      14   all
## 49         tell_us        24   47      13   all
## 50       three_two        24   47      11   all
# Find frequently co-occurring word triples:
# build 3-grams from the stemmed tokens, count them in a dfm, show the top 50
marvel_corpus_ngram3 <- dfm(tokens_ngrams(marvel_corpus_tokens, n = 3))
textstat_frequency(marvel_corpus_ngram3, n = 50)
##                  feature frequency rank docfreq group
## 1               go_go_go        33    1      10   all
## 2         wait_wait_wait        24    2      11   all
## 3          three_two_one        22    3      10   all
## 4            hey_hey_hey        20    4       9   all
## 5         yeah_yeah_yeah        15    5       8   all
## 6         okay_okay_okay        12    6       6   all
## 7         come_come_come        10    7       5   all
## 8         well_left_hand        10    7       1   all
## 9         left_hand_free        10    7       1   all
## 10           oh_god_okay         9   10       6   all
## 11             oh_god_oh         8   11       4   all
## 12        move_move_move         8   11       4   all
## 13          hand_free_oh         8   11       1   all
## 14       hail_hydra_hail         8   11       1   all
## 15      hydra_hail_hydra         8   11       1   all
## 16          sea_bass_sea         8   11       1   all
## 17         bass_sea_bass         8   11       1   all
## 18            god_oh_god         7   18       4   all
## 19       five_four_three         7   18       5   all
## 20        whoa_whoa_whoa         7   18       6   all
## 21         five_year_ago         7   18       4   all
## 22          oh_well_left         7   18       1   all
## 23        can_still_hear         7   18       2   all
## 24           hey_big_guy         6   24       3   all
## 25         thor_son_odin         6   24       3   all
## 26     pleas_pleas_pleas         6   24       2   all
## 27        okay_yeah_yeah         6   24       5   all
## 28          free_oh_well         6   24       1   all
## 29          six_year_ago         6   24       3   all
## 30        love_now_never         6   24       1   all
## 31        now_never_love         6   24       1   all
## 32        never_love_can         6   24       1   all
## 33        love_can_still         6   24       1   all
## 34        still_hear_say         6   24       1   all
## 35        hear_say_never         6   24       1   all
## 36       say_never_break         6   24       1   all
## 37     never_break_chain         6   24       1   all
## 38 watch_beeping_rapidli         6   24       1   all
## 39        know_feel_like         5   39       3   all
## 40         look_like_got         5   39       5   all
## 41        four_three_two         5   39       5   all
## 42          oh_thank_god         5   39       5   all
## 43          hey_mr_stark         5   39       4   all
## 44 dormammu_come_bargain         5   39       1   all
## 45        toni_toni_toni         5   39       3   all
## 46    colonel_jame_rhode         5   39       2   all
## 47        feel_like_know         5   39       2   all
## 48          go_right_now         4   48       4   all
## 49           yes_yes_yes         4   48       2   all
## 50        gotta_go_gotta         4   48       4   all
# 50 most frequent stems across all scripts (default English stopword list).
# NOTE(review): marvel_corpus_tokens was already stemmed with tokens_wordstem()
# above, so dfm_wordstem() is a second stemming pass applied after dfm()
# lowercases the features -- confirm the double stemming is intended.
topfeatures(dfm_wordstem(dfm(marvel_corpus_tokens)), n = 50)
##   know     go    get   just  right   like   yeah   come    can    one    got 
##   1312   1179   1054   1019    988    890    870    834    802    773    758 
##    now   okay  gonna     oh   want   need   look  think    hey   time   back 
##    737    707    647    627    609    608    599    586    558    548    532 
##    see   well     us   take   good  thing    guy    man   tell  thank    say 
##    521    517    503    495    488    447    445    413    391    381    377 
##    yes   make  stark   call   toni realli    way   work someth  sorri  peopl 
##    371    365    361    353    349    336    335    329    315    314    313 
##   kill    tri   help  never  littl   give 
##    300    293    286    281    276    274

Marvel Alternate Stopwords

# Same stopword removal as before, but using the "stopwords-iso" list from
# the stopwords package (plus the stray "nbsp").
# padding = FALSE removes the tokens outright instead of leaving empty
# placeholders; spelled out as FALSE because `F` is an ordinary variable
# that can be reassigned.
marvel_corpus_tokens_alt <- tokens_remove(
  marvel_corpus_tokens_orig,
  c(stopwords::stopwords(source = "stopwords-iso"), "nbsp"),
  padding = FALSE
)

marvel_corpus_tokens_alt
## Tokens consisting of 23 documents.
## Ant-Man.And.The.Wasp.txt :
##  [1] "night"       "mother"      "leave"       "settled"     "indication" 
##  [6] "mommy"       "jellybean"   "daddy"       "last-minute" "business"   
## [11] "trip"        "rose"       
## [ ... and 2,786 more ]
## 
## Ant-Man.txt :
##  [1] "stark"     "happy"     "hank"      "supposed"  "moscow"    "detour"   
##  [7] "defense"   "lab"       "depends"   "poor"      "attempt"   "replicate"
## [ ... and 2,694 more ]
## 
## Avengers.Age.of.Ultron.txt :
##  [1] "DISTANT"      "EXPLOSION"    "STRUCKER"     "report"       "stations"    
##  [6] "drill"        "attack"       "SOLDIERS"     "SHOUTING"     "INDISTINCTLY"
## [11] "attack"       "GRUNTS"      
## [ ... and 3,672 more ]
## 
## Avengers.Endgame.txt :
##  [1] "hold"    "shoot"   "mm-hmm"  "worry"   "gotta"   "foot"    "toe"    
##  [8] "hips"    "yeah"    "mm-hmm"  "ready"   "fingers"
## [ ... and 3,057 more ]
## 
## Avengers.Infinity.War.txt :
##  [1] "asgardian" "refugee"   "vessel"    "statesman" "assault"   "repeat"   
##  [7] "assault"   "engines"   "dead"      "life"      "support"   "failing"  
## [ ... and 2,498 more ]
## 
## Avengers.txt :
##  [1] "tesseract" "awakened"  "human"     "wield"     "power"     "ally"     
##  [7] "workings"  "ready"     "lead"      "force"     "chitauri"  "follow"   
## [ ... and 2,883 more ]
## 
## [ reached max_ndoc ... 17 more documents ]
# Reduce every remaining token to its word stem (e.g. "settled" -> "settl")
marvel_corpus_tokens_alt <- marvel_corpus_tokens_alt %>% tokens_wordstem()
marvel_corpus_tokens_alt
## Tokens consisting of 23 documents.
## Ant-Man.And.The.Wasp.txt :
##  [1] "night"      "mother"     "leav"       "settl"      "indic"     
##  [6] "mommi"      "jellybean"  "daddi"      "last-minut" "busi"      
## [11] "trip"       "rose"      
## [ ... and 2,786 more ]
## 
## Ant-Man.txt :
##  [1] "stark"   "happi"   "hank"    "suppos"  "moscow"  "detour"  "defens" 
##  [8] "lab"     "depend"  "poor"    "attempt" "replic" 
## [ ... and 2,694 more ]
## 
## Avengers.Age.of.Ultron.txt :
##  [1] "DISTANT"      "EXPLOSION"    "STRUCKER"     "report"       "station"     
##  [6] "drill"        "attack"       "SOLDIERS"     "SHOUTING"     "INDISTINCTLi"
## [11] "attack"       "GRUNTS"      
## [ ... and 3,672 more ]
## 
## Avengers.Endgame.txt :
##  [1] "hold"   "shoot"  "mm-hmm" "worri"  "gotta"  "foot"   "toe"    "hip"   
##  [9] "yeah"   "mm-hmm" "readi"  "finger"
## [ ... and 3,057 more ]
## 
## Avengers.Infinity.War.txt :
##  [1] "asgardian" "refuge"    "vessel"    "statesman" "assault"   "repeat"   
##  [7] "assault"   "engin"     "dead"      "life"      "support"   "fail"     
## [ ... and 2,498 more ]
## 
## Avengers.txt :
##  [1] "tesseract" "awaken"    "human"     "wield"     "power"     "alli"     
##  [7] "work"      "readi"     "lead"      "forc"      "chitauri"  "follow"   
## [ ... and 2,883 more ]
## 
## [ reached max_ndoc ... 17 more documents ]
# Find frequently co-occurring word pairs (typically compound words or names):
# build 2-grams from the stemmed tokens, count them in a dfm, show the top 50
marvel_corpus_ngram_alt <- dfm(tokens_ngrams(marvel_corpus_tokens_alt, n = 2))
textstat_frequency(marvel_corpus_ngram_alt, n = 50)
##                  feature frequency rank docfreq group
## 1                hey_hey        78    1      12   all
## 2              yeah_yeah        63    2      16   all
## 3              wait_wait        54    3      13   all
## 4             toni_stark        53    4       9   all
## 5        captain_america        41    5       8   all
## 6                hey_guy        29    6      14   all
## 7              toni_toni        24    7       5   all
## 8            gonna_gonna        20    8      14   all
## 9              save_life        20    8      17   all
## 10      people_screaming        20    8       3   all
## 11           infin_stone        20    8       8   all
## 12         captain_roger        20    8       4   all
## 13            yeah_gonna        18   13       9   all
## 14 spider-man_spider-man        18   13       2   all
## 15             gonna_die        17   15      11   all
## 16             whoa_whoa        17   15       8   all
## 17             nick_furi        17   15       7   all
## 18              hank_pym        16   18       4   all
## 19            gonna_kill        16   18       9   all
## 20        stark_industri        16   18       3   all
## 21         quantum_realm        15   21       3   all
## 22             son_bitch        15   21      12   all
## 23            hail_hydra        15   21       4   all
## 24   speaking_portuguese        15   21       1   all
## 25              yeah_hey        14   25       7   all
## 26             yeah_time        13   26       9   all
## 27               uh_yeah        13   26       8   all
## 28               bad_guy        13   26       9   all
## 29             love_love        13   26       6   all
## 30         music_playing        13   26       5   all
## 31           dark_dimens        13   26       1   all
## 32             nova_corp        13   26       2   all
## 33             bird_bird        13   26       2   all
## 34          night_monkey        13   26       1   all
## 35              sea_bass        13   26       1   all
## 36           truth_serum        12   36       1   all
## 37             time_time        12   36       8   all
## 38            wait_minut        12   36       9   all
## 39              yeah_guy        12   36       8   all
## 40       grunting_groans        12   36       3   all
## 41             lot_peopl        12   36       8   all
## 42          peter_parker        12   36       3   all
## 43        agent_romanoff        12   36       3   all
## 44             plan_plan        12   36       5   all
## 45         black_panther        12   36       2   all
## 46         watch_beeping        12   36       3   all
## 47             left_hand        12   36       1   all
## 48                dr_pym        11   48       2   all
## 49        ladi_gentlemen        11   48       6   all
## 50               yeah_uh        11   48       8   all
# Find frequently co-occurring word triples:
# build 3-grams from the stemmed tokens, count them in a dfm, show the top 50
marvel_corpus_ngram3_alt <- dfm(tokens_ngrams(marvel_corpus_tokens_alt, n = 3))
textstat_frequency(marvel_corpus_ngram3_alt, n = 50)
##                             feature frequency rank docfreq group
## 1                       hey_hey_hey        26    1      10   all
## 2                    wait_wait_wait        24    2      11   all
## 3                    yeah_yeah_yeah        16    3       8   all
## 4                      sea_bass_sea         9    4       1   all
## 5                     bass_sea_bass         9    4       1   all
## 6                   hail_hydra_hail         8    6       1   all
## 7                  hydra_hail_hydra         8    6       1   all
## 8                    whoa_whoa_whoa         7    8       6   all
## 9                    bird_bird_bird         7    8       1   all
## 10                    thor_son_odin         6   10       3   all
## 11                   left_hand_left         6   10       1   all
## 12                   hand_left_hand         6   10       1   all
## 13                   love_love_hear         6   10       1   all
## 14                  love_hear_break         6   10       1   all
## 15                 hear_break_chain         6   10       1   all
## 16                   toni_toni_toni         6   10       4   all
## 17 spider-man_spider-man_spider-man         6   10       2   all
## 18            watch_beeping_rapidli         6   10       1   all
## 19               colonel_jame_rhode         5   19       2   all
## 20                  toni_stark_toni         5   19       2   all
## 21               night_monkey_night         5   19       1   all
## 22              monkey_night_monkey         5   19       1   all
## 23               concept_time_space         4   23       1   all
## 24            doctor_stephen_strang         4   23       3   all
## 25                   drax_drax_drax         4   23       2   all
## 26   friend_neighborhood_spider-man         4   23       3   all
## 27             prais_ancestor_prais         4   23       1   all
## 28          ancestor_prais_ancestor         4   23       1   all
## 29            mission_report_decemb         4   23       1   all
## 30                 report_decemb_16         4   23       1   all
## 31                   decemb_16_1991         4   23       1   all
## 32                  peopl_gonna_die         4   23       3   all
## 33                  draw_power_dark         4   23       1   all
## 34                power_dark_dimens         4   23       1   all
## 35             strang_doctor_strang         4   23       2   all
## 36                    tea_drink_tea         4   23       2   all
## 37                 break_chain_love         4   23       1   all
## 38                  chain_love_love         4   23       1   all
## 39             lieuten_colonel_jame         4   23       1   all
## 40               ceo_stark_industri         4   23       2   all
## 41                latin_speak_latin         4   23       1   all
## 42                 toni_stark_trust         4   23       1   all
## 43                night_monkey_yeah         4   23       1   all
## 44          thunder_thunder_thunder         4   23       1   all
## 45                   dr_erik_selvig         4   23       2   all
## 46                    hope_van_dyne         3   46       1   all
## 47                     pym_van_dyne         3   46       1   all
## 48               time_space_irrelev         3   46       1   all
## 49                      hey_guy_sun         3   46       2   all
## 50                     guy_sun_real         3   46       2   all
# Top 50 stemmed features of the current alternate token set.
marvel_corpus_tokens_alt %>%
  dfm() %>%
  dfm_wordstem() %>%
  topfeatures(n = 50)
##    yeah   gonna     hey    time     guy   stark    toni   peopl    kill    wait 
##     870     647     558     548     445     361     349     313     300     269 
##   power     sir    talk     god    suit    love    life     day  friend   grunt 
##     245     244     227     218     192     185     185     182     180     171 
##  happen  father   stone    come   gotta   peter    hand     die    head     kid 
##     170     167     167     166     164     162     161     160     154     152 
##  weapon    call     lot    told    stay    thor    live    feel    save captain 
##     151     150     146     146     145     145     143     143     142     142 
##   groan    plan      uh   start   wanna    hope    leav    fine    nice   bring 
##     142     133     133     132     130     128     127     126     125     124
# Baseline for comparison: top 50 stemmed features of marvel_corpus_tokens.
marvel_corpus_tokens %>%
  dfm() %>%
  dfm_wordstem() %>%
  topfeatures(n = 50)
##   know     go    get   just  right   like   yeah   come    can    one    got 
##   1312   1179   1054   1019    988    890    870    834    802    773    758 
##    now   okay  gonna     oh   want   need   look  think    hey   time   back 
##    737    707    647    627    609    608    599    586    558    548    532 
##    see   well     us   take   good  thing    guy    man   tell  thank    say 
##    521    517    503    495    488    447    445    413    391    381    377 
##    yes   make  stark   call   toni realli    way   work someth  sorri  peopl 
##    371    365    361    353    349    336    335    329    315    314    313 
##   kill    tri   help  never  littl   give 
##    300    293    286    281    276    274

Marvel Alternate Stopwords II

# Alternate stop-word list II: drop the first 5,000 entries of
# cleanNLP::word_frequency (presumably ordered most-frequent first; confirm),
# plus the `contractions` vector defined earlier in this file and the stray
# "nbsp" token. `padding = FALSE` is spelled out because `F` is an ordinary,
# reassignable variable rather than a reserved word.
marvel_corpus_tokens_alt <- tokens_remove(
  marvel_corpus_tokens_orig,
  c(cleanNLP::word_frequency$word[1:5000], contractions, "nbsp"),
  padding = FALSE
)

# Print a preview of the filtered tokens.
marvel_corpus_tokens_alt
## Tokens consisting of 23 documents.
## Ant-Man.And.The.Wasp.txt :
##  [1] "hopefully"   "settled"     "indication"  "mommy"       "jellybean"  
##  [6] "daddy"       "last-minute" "ugh"         "boring"      "goodbye"    
## [11] "sweetheart"  "janet"      
## [ ... and 1,194 more ]
## 
## Ant-Man.txt :
##  [1] "stark"      "hank"       "moscow"     "detour"     "replicate" 
##  [6] "nerve"      "instructed" "remind"     "pym"        "soldier"   
## [11] "scientist"  "pym"       
## [ ... and 1,148 more ]
## 
## Avengers.Age.of.Ultron.txt :
##  [1] "DISTANT"      "EXPLOSION"    "STRUCKER"     "drill"        "SHOUTING"    
##  [6] "INDISTINCTLY" "GRUNTS"       "POWERING"     "YELLS"        "GRUNTING"    
## [11] "SCREAMING"    "ROARING"     
## [ ... and 1,831 more ]
## 
## Avengers.Endgame.txt :
##  [1] "shoot"   "mm-hmm"  "worry"   "gotta"   "hips"    "mm-hmm"  "fingers"
##  [8] "kiddo"   "mayo"    "mustard" "puts"    "mayo"   
## [ ... and 1,223 more ]
## 
## Avengers.Infinity.War.txt :
##  [1] "asgardian"  "refugee"    "vessel"     "statesman"  "assault"   
##  [6] "assault"    "failing"    "requesting" "vessel"     "22"        
## [11] "asgard"     "asgardian" 
## [ ... and 1,121 more ]
## 
## Avengers.txt :
##  [1] "tesseract"  "awakened"   "wield"      "ally"       "workings"  
##  [6] "chitauri"   "humans"     "evacuation" "selvig"     "surge"     
## [11] "tesseract"  "authorise" 
## [ ... and 1,239 more ]
## 
## [ reached max_ndoc ... 17 more documents ]
# Lower-case before stemming: the stemmer leaves upper-case tokens (subtitle
# cues such as "GRUNTS") unstemmed or half-stemmed ("INDISTINCTLi"), which
# splits one word across several n-gram features further down.
# NOTE(review): re-knit so the printed output reflects the lower-cased tokens.
marvel_corpus_tokens_alt <- marvel_corpus_tokens_alt %>%
  tokens_tolower() %>%
  tokens_wordstem()
marvel_corpus_tokens_alt
## Tokens consisting of 23 documents.
## Ant-Man.And.The.Wasp.txt :
##  [1] "hope"       "settl"      "indic"      "mommi"      "jellybean" 
##  [6] "daddi"      "last-minut" "ugh"        "bore"       "goodby"    
## [11] "sweetheart" "janet"     
## [ ... and 1,194 more ]
## 
## Ant-Man.txt :
##  [1] "stark"     "hank"      "moscow"    "detour"    "replic"    "nerv"     
##  [7] "instruct"  "remind"    "pym"       "soldier"   "scientist" "pym"      
## [ ... and 1,148 more ]
## 
## Avengers.Age.of.Ultron.txt :
##  [1] "DISTANT"      "EXPLOSION"    "STRUCKER"     "drill"        "SHOUTING"    
##  [6] "INDISTINCTLi" "GRUNTS"       "POWERING"     "YELLS"        "GRUNTING"    
## [11] "SCREAMING"    "ROARING"     
## [ ... and 1,831 more ]
## 
## Avengers.Endgame.txt :
##  [1] "shoot"   "mm-hmm"  "worri"   "gotta"   "hip"     "mm-hmm"  "finger" 
##  [8] "kiddo"   "mayo"    "mustard" "put"     "mayo"   
## [ ... and 1,223 more ]
## 
## Avengers.Infinity.War.txt :
##  [1] "asgardian" "refuge"    "vessel"    "statesman" "assault"   "assault"  
##  [7] "fail"      "request"   "vessel"    "22"        "asgard"    "asgardian"
## [ ... and 1,121 more ]
## 
## Avengers.txt :
##  [1] "tesseract" "awaken"    "wield"     "alli"      "work"      "chitauri" 
##  [7] "human"     "evacu"     "selvig"    "surg"      "tesseract" "authoris" 
## [ ... and 1,239 more ]
## 
## [ reached max_ndoc ... 17 more documents ]
# Find frequently co-occurring word pairs (typically compound words):
# build a bigram document-feature matrix and list its 50 most frequent features.
marvel_corpus_ngram_alt <- marvel_corpus_tokens_alt %>%
  tokens_ngrams(n = 2) %>%
  dfm()
textstat_frequency(marvel_corpus_ngram_alt, n = 50)
##                     feature frequency rank docfreq group
## 1               stark_stark        30    1       8   all
## 2     spider-man_spider-man        25    2       2   all
## 3                 whoa_whoa        17    3       8   all
## 4                  hank_pym        16    4       4   all
## 5           grunting_groans        16    4       3   all
## 6             quantum_realm        15    6       3   all
## 7                 hulk_hulk        15    6       5   all
## 8               thano_thano        15    6       3   all
## 9                hail_hydra        15    6       4   all
## 10            breath_breath        13   10       4   all
## 11            colonel_rhode        13   10       4   all
## 12              infin_stone        12   12       5   all
## 13              gotta_gotta        11   13       9   all
## 14                thor_thor        11   13       7   all
## 15            hammer_hammer        11   13       6   all
## 16              groot_groot        11   13       3   all
## 17            grunts_groans        10   17       3   all
## 18           sirens_wailing        10   17       4   all
## 19                furi_furi        10   17       4   all
## 20              arc_reactor        10   17       3   all
## 21            monkey_monkey        10   17       2   all
## 22        t'challa_t'challa        10   17       1   all
## 23              helmut_zemo        10   17       1   all
## 24       indistinct_chatter         9   24       3   all
## 25                loki_loki         9   24       4   all
## 26                drax_drax         9   24       2   all
## 27 indistinct_conversations         8   27       2   all
## 28                dude_dude         8   27       2   all
## 29            asgard_asgard         8   27       4   all
## 30               hydra_hail         8   27       1   all
## 31              frost_giant         8   27       1   all
## 32                hurt_hurt         7   32       6   all
## 33              wanna_wanna         7   32       5   all
## 34        breathing_heavili         7   32       3   all
## 35                thor_odin         7   32       4   all
## 36              aveng_aveng         7   32       5   all
## 37              stone_stone         7   32       3   all
## 38                hand_hand         7   32       2   all
## 39              erik_selvig         7   32       3   all
## 40          heart-shap_herb         7   32       1   all
## 41           prais_ancestor         7   32       1   all
## 42            sergeant_barn         7   32       4   all
## 43          lieuten_colonel         7   32       3   all
## 44                  ver_ver         7   32       1   all
## 45              drone_drone         7   32       2   all
## 46         stark_internship         7   32       1   all
## 47          beeping_rapidli         7   32       1   all
## 48    indistinct_chattering         7   32       1   all
## 49          heimdal_heimdal         7   32       2   all
## 50          thunder_thunder         7   32       1   all
# Find frequently co-occurring word triples (typically compound words):
# build a trigram document-feature matrix and list its 50 most frequent features.
marvel_corpus_ngram3_alt <- marvel_corpus_tokens_alt %>%
  tokens_ngrams(n = 3) %>%
  dfm()
textstat_frequency(marvel_corpus_ngram3_alt, n = 50)
##                             feature frequency rank docfreq group
## 1  spider-man_spider-man_spider-man        11    1       2   all
## 2                 stark_stark_stark        10    2       5   all
## 3                   hail_hydra_hail         8    3       1   all
## 4                  hydra_hail_hydra         8    3       1   all
## 5                    whoa_whoa_whoa         7    5       6   all
## 6        t'challa_t'challa_t'challa         7    5       1   all
## 7                    dude_dude_dude         6    7       2   all
## 8                    hulk_hulk_hulk         6    7       2   all
## 9              monkey_monkey_monkey         6    7       1   all
## 10                   drax_drax_drax         5   10       2   all
## 11             breath_breath_breath         5   10       2   all
## 12            lieuten_colonel_rhode         5   10       2   all
## 13                toast_toast_toast         5   10       1   all
## 14             prais_ancestor_prais         4   14       1   all
## 15          ancestor_prais_ancestor         4   14       1   all
## 16                   hand_hand_hand         4   14       1   all
## 17               quill_rocket_quill         4   14       2   all
## 18          thunder_thunder_thunder         4   14       1   all
## 19                    hank_pym_hank         3   19       2   all
## 20       recalibr_recalibr_recalibr         3   19       1   all
## 21             earth_mightiest_hero         3   19       3   all
## 22                stone_snap_finger         3   19       2   all
## 23                   lila_lila_lila         3   19       1   all
## 24                infin_stone_stone         3   19       2   all
## 25                   loki_loki_loki         3   19       2   all
## 26    kamar-taj_kamar-taj_kamar-taj         3   19       1   all
## 27          sanctum_sanctum_sanctum         3   19       1   all
## 28                drone_drone_drone         3   19       1   all
## 29           grunting_groans_grunts         3   19       1   all
## 30                      ned_liz_liz         3   19       1   all
## 31          beeping_rapidli_beeping         3   19       1   all
## 32            rapidli_beeping_slows         3   19       1   all
## 33             reveng_reveng_reveng         3   19       1   all
## 34              spear_thudding_thor         3   19       1   all
## 35             subatom_deactiv_bomb         2   35       2   all
## 36                anton_anton_anton         2   35       1   all
## 37                    hank_pym_dyne         2   35       1   all
## 38                    bye_daddi_bye         2   35       1   all
## 39                destroy_life_hank         2   35       1   all
## 40        paraponera_clavata_bullet         2   35       1   all
## 41               clavata_bullet_ant         2   35       1   all
## 42                gotta_darren_hank         2   35       1   all
## 43            entranc_exterior_vent         2   35       1   all
## 44                daddi_cassi_daddi         2   35       1   all
## 45        strucker_soldier_shouting         2   35       1   all
## 46              sooner_stark_sceptr         2   35       1   all
## 47              ultron_stark_jarvis         2   35       1   all
## 48              stark_ultron_jarvis         2   35       1   all
## 49              jarvis_stark_ultron         2   35       1   all
## 50               stark_rhodes_stark         2   35       1   all
# Top 50 stemmed features after removing the 5,000-word stop list.
marvel_corpus_tokens_alt %>%
  dfm() %>%
  dfm_wordstem() %>%
  topfeatures(n = 50)
##       stark       grunt       gotta        thor       groan          uh 
##         361         171         164         145         142         133 
##       wanna      asgard        loki       jarvi        furi       thano 
##         130         122         115         104          94          94 
##      scream       aveng        whoa        damn s.h.i.e.l.d     destroy 
##          93          91          90          90          89          88 
##  spider-man       roger        hulk       quill      ultron       hydra 
##          86          82          80          79          78          76 
##        hank       worri        gasp      pepper       groot       stone 
##          74          72          72          69          69          68 
##       laugh      weapon     soldier         huh        shut       excus 
##          67          66          65          63          63          62 
##      cannot        hurt       realm        dude        odin     wakanda 
##          62          61          60          59          58          57 
##      hammer       ronan      breath    somebodi   tesseract       daddi 
##          56          54          53          50          50          49 
##        beep         ned 
##          49          48
# Baseline for comparison: top 50 stemmed features of marvel_corpus_tokens.
marvel_corpus_tokens %>%
  dfm() %>%
  dfm_wordstem() %>%
  topfeatures(n = 50)
##   know     go    get   just  right   like   yeah   come    can    one    got 
##   1312   1179   1054   1019    988    890    870    834    802    773    758 
##    now   okay  gonna     oh   want   need   look  think    hey   time   back 
##    737    707    647    627    609    608    599    586    558    548    532 
##    see   well     us   take   good  thing    guy    man   tell  thank    say 
##    521    517    503    495    488    447    445    413    391    381    377 
##    yes   make  stark   call   toni realli    way   work someth  sorri  peopl 
##    371    365    361    353    349    336    335    329    315    314    313 
##   kill    tri   help  never  littl   give 
##    300    293    286    281    276    274

Marvel Alternate Stopwords III

# Alternate stop-word list III: drop the first 10,000 entries of
# cleanNLP::word_frequency (presumably ordered most-frequent first; confirm),
# plus the `contractions` vector defined earlier in this file and the stray
# "nbsp" token. `padding = FALSE` is spelled out because `F` is an ordinary,
# reassignable variable rather than a reserved word.
marvel_corpus_tokens_alt <- tokens_remove(
  marvel_corpus_tokens_orig,
  c(cleanNLP::word_frequency$word[1:10000], contractions, "nbsp"),
  padding = FALSE
)

# Print a preview of the filtered tokens.
marvel_corpus_tokens_alt
## Tokens consisting of 23 documents.
## Ant-Man.And.The.Wasp.txt :
##  [1] "mommy"       "jellybean"   "last-minute" "ugh"         "goodbye"    
##  [6] "sweetheart"  "tucked"      "disarm"      "plating"     "shrink"     
## [11] "hank"        "regulator"  
## [ ... and 703 more ]
## 
## Ant-Man.txt :
##  [1] "stark"      "hank"       "detour"     "replicate"  "instructed"
##  [6] "pym"        "pym"        "errand"     "ferocity"   "pym"       
## [11] "hank"       "ferocity"  
## [ ... and 709 more ]
## 
## Avengers.Age.of.Ultron.txt :
##  [1] "STRUCKER"     "SHOUTING"     "INDISTINCTLY" "GRUNTS"       "POWERING"    
##  [6] "YELLS"        "GRUNTING"     "SCREAMING"    "ROARING"      "jarvis"      
## [11] "upstairs"     "JARVIS"      
## [ ... and 1,336 more ]
## 
## Avengers.Endgame.txt :
##  [1] "mm-hmm"  "hips"    "mm-hmm"  "kiddo"   "mayo"    "mustard" "mayo"   
##  [8] "mustard" "mama"    "nate"    "mayo"    "mustard"
## [ ... and 751 more ]
## 
## Avengers.Infinity.War.txt :
##  [1] "asgardian"   "refugee"     "statesman"   "22"          "asgard"     
##  [6] "asgardian"   "rejoice"     "titan"       "thanos"      "desperately"
## [11] "nonetheless" "frightening"
## [ ... and 702 more ]
## 
## Avengers.txt :
##  [1] "tesseract"   "awakened"    "wield"       "ally"        "workings"   
##  [6] "chitauri"    "evacuation"  "selvig"      "tesseract"   "authorise"  
## [11] "selvig"      "spontaneous"
## [ ... and 856 more ]
## 
## [ reached max_ndoc ... 17 more documents ]
# Lower-case before stemming: the stemmer leaves upper-case tokens (subtitle
# cues such as "GRUNTS") unstemmed or half-stemmed ("INDISTINCTLi"), which
# splits one word across several n-gram features further down.
# NOTE(review): re-knit so the printed output reflects the lower-cased tokens.
marvel_corpus_tokens_alt <- marvel_corpus_tokens_alt %>%
  tokens_tolower() %>%
  tokens_wordstem()
marvel_corpus_tokens_alt
## Tokens consisting of 23 documents.
## Ant-Man.And.The.Wasp.txt :
##  [1] "mommi"      "jellybean"  "last-minut" "ugh"        "goodby"    
##  [6] "sweetheart" "tuck"       "disarm"     "plate"      "shrink"    
## [11] "hank"       "regul"     
## [ ... and 703 more ]
## 
## Ant-Man.txt :
##  [1] "stark"    "hank"     "detour"   "replic"   "instruct" "pym"     
##  [7] "pym"      "errand"   "feroc"    "pym"      "hank"     "feroc"   
## [ ... and 709 more ]
## 
## Avengers.Age.of.Ultron.txt :
##  [1] "STRUCKER"     "SHOUTING"     "INDISTINCTLi" "GRUNTS"       "POWERING"    
##  [6] "YELLS"        "GRUNTING"     "SCREAMING"    "ROARING"      "jarvi"       
## [11] "upstair"      "JARVIS"      
## [ ... and 1,336 more ]
## 
## Avengers.Endgame.txt :
##  [1] "mm-hmm"  "hip"     "mm-hmm"  "kiddo"   "mayo"    "mustard" "mayo"   
##  [8] "mustard" "mama"    "nate"    "mayo"    "mustard"
## [ ... and 751 more ]
## 
## Avengers.Infinity.War.txt :
##  [1] "asgardian"   "refuge"      "statesman"   "22"          "asgard"     
##  [6] "asgardian"   "rejoic"      "titan"       "thano"       "desper"     
## [11] "nonetheless" "frighten"   
## [ ... and 702 more ]
## 
## Avengers.txt :
##  [1] "tesseract" "awaken"    "wield"     "alli"      "work"      "chitauri" 
##  [7] "evacu"     "selvig"    "tesseract" "authoris"  "selvig"    "spontan"  
## [ ... and 856 more ]
## 
## [ reached max_ndoc ... 17 more documents ]
# Find frequently co-occurring word pairs (typically compound words):
# build a bigram document-feature matrix and list its 50 most frequent features.
marvel_corpus_ngram_alt <- marvel_corpus_tokens_alt %>%
  tokens_ngrams(n = 2) %>%
  dfm()
textstat_frequency(marvel_corpus_ngram_alt, n = 50)
##                    feature frequency rank docfreq group
## 1              stark_stark        42    1      10   all
## 2    spider-man_spider-man        28    2       2   all
## 3                hulk_hulk        19    3       5   all
## 4              thano_thano        18    4       4   all
## 5                whoa_whoa        17    5       8   all
## 6                furi_furi        17    5       4   all
## 7                 hank_pym        16    7       4   all
## 8          grunting_groans        16    7       3   all
## 9               hail_hydra        15    9       4   all
## 10               thor_thor        14   10       7   all
## 11               loki_loki        13   11       4   all
## 12             groot_groot        13   11       3   all
## 13           colonel_rhode        13   11       4   all
## 14           breath_breath        12   14       4   all
## 15       t'challa_t'challa        12   14       2   all
## 16           grunts_groans        11   16       3   all
## 17             quill_quill        11   16       3   all
## 18               drax_drax        11   16       2   all
## 19          sirens_wailing        10   19       4   all
## 20           asgard_asgard        10   19       4   all
## 21             helmut_zemo        10   19       1   all
## 22      indistinct_chatter         9   22       3   all
## 23             drone_drone         9   22       2   all
## 24             aveng_aveng         8   24       5   all
## 25              hydra_hail         8   24       1   all
## 26         beeping_beeping         8   24       2   all
## 27         malekith_aether         8   24       1   all
## 28                pym_hank         7   28       3   all
## 29             aveng_stark         7   28       6   all
## 30           groans_groans         7   28       2   all
## 31            stark_ultron         7   28       1   all
## 32         wakanda_wakanda         7   28       2   all
## 33               thor_odin         7   28       4   all
## 34               loki_thor         7   28       3   all
## 35         lieuten_colonel         7   28       3   all
## 36                 ver_ver         7   28       1   all
## 37             odin_asgard         7   28       3   all
## 38                 ned_ned         7   28       2   all
## 39   indistinct_chattering         7   28       1   all
## 40         heimdal_heimdal         7   28       2   all
## 41               baba_yaga         6   41       1   all
## 42      grunting_screaming         6   41       5   all
## 43           groans_grunts         6   41       2   all
## 44            ultron_stark         6   41       1   all
## 45           ultron_jarvis         6   41       1   all
## 46 s.h.i.e.l.d_s.h.i.e.l.d         6   41       4   all
## 47             swear_swear         6   41       4   all
## 48           grunts_grunts         6   41       2   all
## 49               hand_hand         6   41       1   all
## 50           lawson_lawson         6   41       1   all
# Find frequently co-occurring word triples (typically compound words):
# build a trigram document-feature matrix and list its 50 most frequent features.
marvel_corpus_ngram3_alt <- marvel_corpus_tokens_alt %>%
  tokens_ngrams(n = 3) %>%
  dfm()
textstat_frequency(marvel_corpus_ngram3_alt, n = 50)
##                             feature frequency rank docfreq group
## 1  spider-man_spider-man_spider-man        11    1       2   all
## 2                 stark_stark_stark        10    2       5   all
## 3                    hulk_hulk_hulk         9    3       2   all
## 4        t'challa_t'challa_t'challa         9    3       1   all
## 5                   hail_hydra_hail         8    5       1   all
## 6                  hydra_hail_hydra         8    5       1   all
## 7                    whoa_whoa_whoa         7    7       6   all
## 8                    drax_drax_drax         7    7       2   all
## 9                    furi_furi_furi         6    9       2   all
## 10                   loki_loki_loki         5   10       2   all
## 11             breath_breath_breath         5   10       2   all
## 12            lieuten_colonel_rhode         5   10       2   all
## 13                toast_toast_toast         5   10       1   all
## 14                    hank_pym_hank         4   14       2   all
## 15              jarvis_stark_ultron         4   14       1   all
## 16                   hand_hand_hand         4   14       1   all
## 17    kamar-taj_kamar-taj_kamar-taj         4   14       1   all
## 18          sanctum_sanctum_sanctum         4   14       1   all
## 19                quill_quill_quill         4   14       2   all
## 20       recalibr_recalibr_recalibr         3   20       1   all
## 21                   lila_lila_lila         3   20       1   all
## 22             hydra_johann_schmidt         3   20       1   all
## 23                drone_drone_drone         3   20       1   all
## 24           grunting_groans_grunts         3   20       1   all
## 25            beeping_beeping_slows         3   20       1   all
## 26         malekith_aether_malekith         3   20       1   all
## 27              spear_thudding_thor         3   20       1   all
## 28                anton_anton_anton         2   28       1   all
## 29                    hank_pym_dyne         2   28       1   all
## 30                     pym_hank_pym         2   28       2   all
## 31                   hank_hank_hank         2   28       2   all
## 32           paraponera_clavata_ant         2   28       1   all
## 33             irrelev_shrink_etern         2   28       1   all
## 34               tripl_entranc_vent         2   28       1   all
## 35           whispers_groans_groans         2   28       2   all
## 36              sooner_stark_sceptr         2   28       1   all
## 37              ultron_stark_jarvis         2   28       1   all
## 38              stark_ultron_jarvis         2   28       1   all
## 39              ultron_jarvis_stark         2   28       1   all
## 40               stark_rhodes_stark         2   28       1   all
## 41             rhodey_grunts_groans         2   28       2   all
## 42           grunts_groans_grunting         2   28       2   all
## 43              jarvi_ultron_ultron         2   28       1   all
## 44              grunts_groans_stark         2   28       2   all
## 45                  hulk_hulk_aveng         2   28       2   all
## 46             murder_trillion_atom         2   28       1   all
## 47             trillion_atom_inevit         2   28       1   all
## 48              nebula_nebula_morag         2   28       1   all
## 49              nebula_morag_duplic         2   28       1   all
## 50             repeat_repeat_repeat         2   28       1   all
# Top 50 stemmed features after removing the 10,000-word stop list.
marvel_corpus_tokens_alt %>%
  dfm() %>%
  dfm_wordstem() %>%
  topfeatures(n = 50)
##       stark       grunt        thor       groan      asgard        loki 
##         361         171         145         142         122         115 
##       jarvi        furi       thano      scream       aveng        whoa 
##         104          94          94          93          91          90 
## s.h.i.e.l.d  spider-man        hulk       quill      ultron       hydra 
##          89          86          80          79          78          76 
##        hank        gasp       groot         huh      cannot        odin 
##          74          72          69          63          62          58 
##     wakanda       ronan   tesseract        beep         ned     colonel 
##          57          54          50          49          48          46 
##      barton    romanoff        pant       yondu         pym        sigh 
##          45          44          44          44          43          43 
##      chuckl   vibranium         kid    t'challa       drone       idiot 
##          40          40          38          38          38          37 
##      gamora        kree  indistinct     natasha     heimdal       stole 
##          37          37          36          36          36          35 
##        here      throne 
##          35          35
# Baseline for comparison: top 50 stemmed features of marvel_corpus_tokens.
marvel_corpus_tokens %>%
  dfm() %>%
  dfm_wordstem() %>%
  topfeatures(n = 50)
##   know     go    get   just  right   like   yeah   come    can    one    got 
##   1312   1179   1054   1019    988    890    870    834    802    773    758 
##    now   okay  gonna     oh   want   need   look  think    hey   time   back 
##    737    707    647    627    609    608    599    586    558    548    532 
##    see   well     us   take   good  thing    guy    man   tell  thank    say 
##    521    517    503    495    488    447    445    413    391    381    377 
##    yes   make  stark   call   toni realli    way   work someth  sorri  peopl 
##    371    365    361    353    349    336    335    329    315    314    313 
##   kill    tri   help  never  littl   give 
##    300    293    286    281    276    274

Marvel Alternate Stopwords IV

# Alternate stop-word list IV: drop the first 50,000 entries of
# cleanNLP::word_frequency (presumably ordered most-frequent first; confirm),
# plus the `contractions` vector defined earlier in this file and the stray
# "nbsp" token. `padding = FALSE` is spelled out because `F` is an ordinary,
# reassignable variable rather than a reserved word.
marvel_corpus_tokens_alt <- tokens_remove(
  marvel_corpus_tokens_orig,
  c(cleanNLP::word_frequency$word[1:50000], contractions, "nbsp"),
  padding = FALSE
)

# Print a preview of the filtered tokens.
marvel_corpus_tokens_alt
## Tokens consisting of 23 documents.
## Ant-Man.And.The.Wasp.txt :
##  [1] "jellybean"      "last-minute"    "mother's"       "subatomic"     
##  [5] "burrowed"       "lased"          "micro-treasure" "show-and-tell" 
##  [9] "world's"        "karapetyan"     "overquote"      "karapetyan"    
## [ ... and 225 more ]
## 
## Ant-Man.txt :
##  [1] "pym"       "pym"       "ferocity"  "pym"       "ferocity"  "pym"      
##  [7] "full-size" "pym"       "peachy"    "weirdest"  "ha-ha"     "peachy"   
## [ ... and 211 more ]
## 
## Avengers.Age.of.Ultron.txt :
##  [1] "STRUCKER"     "INDISTINCTLY" "GRUNTS"       "GRUNTING"     "strucker's"  
##  [6] "GRUNTING"     "loki's"       "strucker"     "GROANS"       "GRUNTS"      
## [11] "GROANING"     "GRUNTING"    
## [ ... and 542 more ]
## 
## Avengers.Endgame.txt :
##  [1] "mm-hmm"      "mm-hmm"      "kiddo"       "hawk-eye"    "soup's"     
##  [6] "tearjerker"  "today's"     "21"          "22"          "infection's"
## [11] "meanie"      "48"         
## [ ... and 252 more ]
## 
## Avengers.Infinity.War.txt :
##  [1] "asgardian"  "22"         "asgard"     "asgardian"  "thanos"    
##  [6] "tesseract"  "brother's"  "tesseract"  "asgard"     "asgardian" 
## [11] "asgardian"  "allfathers"
## [ ... and 256 more ]
## 
## Avengers.txt :
##  [1] "tesseract"   "chitauri"    "selvig"      "tesseract"   "selvig"     
##  [6] "selvig"      "evac"        "half-hour"   "tesseract's" "2"          
## [11] "2"           "tesseract"  
## [ ... and 314 more ]
## 
## [ reached max_ndoc ... 17 more documents ]
# Lower-case before stemming: the stemmer leaves upper-case tokens (subtitle
# cues such as "GRUNTS") unstemmed or half-stemmed ("INDISTINCTLi"), which
# splits one word across several n-gram features further down.
# NOTE(review): re-knit so the printed output reflects the lower-cased tokens.
marvel_corpus_tokens_alt <- marvel_corpus_tokens_alt %>%
  tokens_tolower() %>%
  tokens_wordstem()
marvel_corpus_tokens_alt
## Tokens consisting of 23 documents.
## Ant-Man.And.The.Wasp.txt :
##  [1] "jellybean"     "last-minut"    "mother"        "subatom"      
##  [5] "burrow"        "lase"          "micro-treasur" "show-and-tel" 
##  [9] "world"         "karapetyan"    "overquot"      "karapetyan"   
## [ ... and 225 more ]
## 
## Ant-Man.txt :
##  [1] "pym"      "pym"      "feroc"    "pym"      "feroc"    "pym"     
##  [7] "full-siz" "pym"      "peachi"   "weirdest" "ha-ha"    "peachi"  
## [ ... and 211 more ]
## 
## Avengers.Age.of.Ultron.txt :
##  [1] "STRUCKER"     "INDISTINCTLi" "GRUNTS"       "GRUNTING"     "strucker"    
##  [6] "GRUNTING"     "loki"         "strucker"     "GROANS"       "GRUNTS"      
## [11] "GROANING"     "GRUNTING"    
## [ ... and 542 more ]
## 
## Avengers.Endgame.txt :
##  [1] "mm-hmm"   "mm-hmm"   "kiddo"    "hawk-ey"  "soup"     "tearjerk"
##  [7] "today"    "21"       "22"       "infect"   "meani"    "48"      
## [ ... and 252 more ]
## 
## Avengers.Infinity.War.txt :
##  [1] "asgardian" "22"        "asgard"    "asgardian" "thano"     "tesseract"
##  [7] "brother"   "tesseract" "asgard"    "asgardian" "asgardian" "allfath"  
## [ ... and 256 more ]
## 
## Avengers.txt :
##  [1] "tesseract" "chitauri"  "selvig"    "tesseract" "selvig"    "selvig"   
##  [7] "evac"      "half-hour" "tesseract" "2"         "2"         "tesseract"
## [ ... and 314 more ]
## 
## [ reached max_ndoc ... 17 more documents ]
# Find frequently co-occurring word pairs (typically compound words):
# build a bigram document-feature matrix and list its 50 most frequent features.
marvel_corpus_ngram_alt <- marvel_corpus_tokens_alt %>%
  tokens_ngrams(n = 2) %>%
  dfm()
textstat_frequency(marvel_corpus_ngram_alt, n = 50)
##                    feature frequency rank docfreq group
## 1    spider-man_spider-man        36    1       2   all
## 2              thano_thano        26    2       4   all
## 3            asgard_asgard        25    3       5   all
## 4            ultron_ultron        21    4       1   all
## 5          grunting_groans        18    5       3   all
## 6  s.h.i.e.l.d_s.h.i.e.l.d        16    6       5   all
## 7                drax_drax        14    7       2   all
## 8        t'challa_t'challa        14    7       2   all
## 9            grunts_groans        13    9       4   all
## 10         wakanda_wakanda        13    9       3   all
## 11           groans_grunts        12   11       3   all
## 12           groans_groans        12   11       4   all
## 13         grunts_grunting        11   13       3   all
## 14         groans_grunting        10   14       3   all
## 15         heimdal_heimdal        10   14       2   all
## 16         malekith_aether        10   14       1   all
## 17         beeping_beeping         9   17       2   all
## 18         ant-man_ant-man         8   18       4   all
## 19       groaning_grunting         8   18       3   all
## 20            gasps_groans         8   18       4   all
## 21                     1_2         8   18       4   all
## 22       vibranium_wakanda         8   18       2   all
## 23           grunts_grunts         8   18       2   all
## 24                 pym_pym         7   24       3   all
## 25       romanoff_romanoff         7   24       4   all
## 26            gamora_thano         7   24       3   all
## 27     tesseract_tesseract         7   24       2   all
## 28         klaue_vibranium         7   24       1   all
## 29               hand_hand         7   24       1   all
## 30       dormammu_dormammu         7   24       1   all
## 31             yondu_yondu         7   24       2   all
## 32   indistinct_chattering         7   24       1   all
## 33                 sun_sun         7   24       1   all
## 34          chuckles_gasps         6   34       3   all
## 35           gamora_gamora         6   34       2   all
## 36            gasps_grunts         6   34       2   all
## 37               kree_kree         6   34       3   all
## 38     kamar-taj_kamar-taj         6   34       1   all
## 39         sanctum_sanctum         6   34       1   all
## 40               blip_blip         6   34       1   all
## 41                pym_dyne         5   41       1   all
## 42            groans_gasps         5   41       3   all
## 43                     2_3         5   41       5   all
## 44     vibranium_vibranium         5   41       2   all
## 45           cannot_cannot         5   41       4   all
## 46                     2_2         5   41       3   all
## 47   s.h.i.e.l.d_tesseract         5   41       1   all
## 48        t'challa_wakanda         5   41       1   all
## 49       grunting_grunting         5   41       3   all
## 50           skrull_skrull         5   41       1   all
# Find frequently co-occurring word triples: build a document-feature matrix
# of trigrams and list the 50 most frequent ones.
marvel_corpus_ngram3_alt <- dfm(tokens_ngrams(marvel_corpus_tokens_alt, n = 3))
textstat_frequency(marvel_corpus_ngram3_alt, n = 50)
##                             feature frequency rank docfreq group
## 1  spider-man_spider-man_spider-man        18    1       2   all
## 2                    drax_drax_drax         9    2       2   all
## 3        t'challa_t'challa_t'challa         9    2       1   all
## 4              ultron_ultron_ultron         8    4       1   all
## 5              asgard_asgard_asgard         7    5       1   all
## 6                 thano_thano_thano         6    6       3   all
## 7           heimdal_heimdal_heimdal         5    7       2   all
## 8                    hand_hand_hand         4    8       1   all
## 9     kamar-taj_kamar-taj_kamar-taj         4    8       1   all
## 10          sanctum_sanctum_sanctum         4    8       1   all
## 11           grunting_groans_grunts         4    8       1   all
## 12       recalibr_recalibr_recalibr         3   12       1   all
## 13           groans_grunting_groans         3   12       2   all
## 14               gamora_thano_thano         3   12       2   all
## 15             grunts_grunts_groans         3   12       2   all
## 16       dormammu_dormammu_dormammu         3   12       1   all
## 17                   blip_blip_blip         3   12       1   all
## 18             asgard_surtur_asgard         3   12       1   all
## 19             surtur_asgard_asgard         3   12       1   all
## 20                      sun_sun_sun         3   12       1   all
## 21         malekith_aether_malekith         3   12       1   all
## 22           asgard_malekith_aether         3   12       1   all
## 23                   yaga_yaga_yaga         2   23       1   all
## 24                    pym_feroc_pym         2   23       1   all
## 25     ant-man_ant-man_yellowjacket         2   23       1   all
## 26           groans_grunts_groaning         2   23       2   all
## 27              groans_groans_gasps         2   23       2   all
## 28             rhodey_grunts_groans         2   23       2   all
## 29           grunts_groans_grunting         2   23       2   all
## 30           ultron_romanoff_ultron         2   23       1   all
## 31          wakanda_wakanda_wakanda         2   23       2   all
## 32                ultron_man_ultron         2   23       1   all
## 33              jarvi_ultron_ultron         2   23       1   all
## 34           groans_groans_grunting         2   23       2   all
## 35           romanoff_groans_grunts         2   23       1   all
## 36       grunting_groaning_grunting         2   23       2   all
## 37             gamora_gamora_gamora         2   23       1   all
## 38          tesseract_selvig_selvig         2   23       1   all
## 39       wakandan_vibranium_wakanda         2   23       2   all
## 40      vibranium_vibranium_wakanda         2   23       1   all
## 41         wakanda_t'challa_wakanda         2   23       1   all
## 42               nakia_wakanda_bast         2   23       1   all
## 43         t'challa_wakanda_wakanda         2   23       1   all
## 44                   zuri_zuri_zuri         2   23       1   all
## 45            nakia_wakanda_wakanda         2   23       1   all
## 46              shuri_shuri_wakanda         2   23       1   all
## 47           grunts_growls_groaning         2   23       1   all
## 48         binarili_retro-fram_barf         2   23       2   all
## 49                retro-fram_barf_$         2   23       2   all
## 50                       barf_$_611         2   23       2   all
# Top 50 features of marvel_corpus_tokens_alt after converting to a
# document-feature matrix and stemming the features.
topfeatures(dfm_wordstem(dfm(marvel_corpus_tokens_alt)), n = 50)
##       grunt       groan      asgard       thano s.h.i.e.l.d  spider-man 
##         171         139         122          94          89          86 
##      ultron      cannot     wakanda        gasp   tesseract        beep 
##          78          62          57          53          50          46 
##    romanoff       yondu         pym   vibranium      chuckl    t'challa 
##          44          44          43          40          39          38 
##      gamora        kree  indistinct     heimdal        here       growl 
##          37          37          36          36          35          34 
##          10        drax      skrull          30         one           2 
##          33          33          31          29          26          26 
##     coulson           1      rhodey       klaue      selvig           � 
##          26          24          24          24          24          23 
##     ant-man      aether          20    strucker   asgardian      xandar 
##          23          23          22          22          22          21 
##     bifrost    dormammu    malekith       world       stark   jotunheim 
##          21          21          21          19          19          19 
##     sokovia         guy 
##          18          18
# For comparison: top 50 stemmed features of the original token object
# (marvel_corpus_tokens), computed with the same dfm -> stem -> top-n steps.
marvel_corpus_tokens %>%
  dfm() %>%
  dfm_wordstem() %>%
  topfeatures(n = 50)
##   know     go    get   just  right   like   yeah   come    can    one    got 
##   1312   1179   1054   1019    988    890    870    834    802    773    758 
##    now   okay  gonna     oh   want   need   look  think    hey   time   back 
##    737    707    647    627    609    608    599    586    558    548    532 
##    see   well     us   take   good  thing    guy    man   tell  thank    say 
##    521    517    503    495    488    447    445    413    391    381    377 
##    yes   make  stark   call   toni realli    way   work someth  sorri  peopl 
##    371    365    361    353    349    336    335    329    315    314    313 
##   kill    tri   help  never  littl   give 
##    300    293    286    281    276    274